diff --git a/.latexmkrc b/.latexmkrc index 6c6740f..2ea38a8 100644 --- a/.latexmkrc +++ b/.latexmkrc @@ -33,6 +33,8 @@ sub makeglossaries { return system "makeglossaries", "-d", $path, $base_name; } +add_cus_dep( 'bbl', 'bib', 1, 'biber'); + sub biber { my ( $base_name, $path ) = fileparse( $_[0] ); my @args = ( "--output-directory", $path, $base_name ); diff --git a/Makefile b/Makefile index c3b67cf..b658d77 100755 --- a/Makefile +++ b/Makefile @@ -1,7 +1,6 @@ OPTIONS=-shell-escape -file-line-error -synctex=1 -interaction=batchmode SOURCE=report -all: build bib glossaries build build - +all: latexmk debug: lualatex -shell-escape -file-line-error $(SOURCE) @@ -9,12 +8,12 @@ build: lualatex $(OPTIONS) $(SOURCE) latexmk: - latexmk -gg -pdf $(SOURCE) + latexmk -pdf $(SOURCE) bib: - biber $(SOURCE) + biber --output-directory=build $(SOURCE) glossaries: - makeglossaries $(SOURCE) + makeglossaries -d build $(SOURCE) .PHONY: build diff --git a/media/dummy.png b/media/dummy.png new file mode 100644 index 0000000..e723f8f Binary files /dev/null and b/media/dummy.png differ diff --git a/references.bib b/references.bib new file mode 100644 index 0000000..18c9980 --- /dev/null +++ b/references.bib @@ -0,0 +1,841 @@ +@article{acharyaGlobalAnalysisHuman2016, + title = {Global Analysis of Human Duplicated Genes Reveals the Relative Importance of Whole-Genome Duplicates Originated in the Early Vertebrate Evolution}, + author = {Acharya, Debarun and Ghosh, Tapash C.}, + date = {2016-01-22}, + journaltitle = {BMC Genomics}, + shortjournal = {BMC Genomics}, + volume = {17}, + eprint = {26801093}, + eprinttype = {pmid}, + pages = {71}, + issn = {1471-2164}, + doi = {10.1186/s12864-016-2392-0}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4724117/}, + urldate = {2024-03-27}, + abstract = {Background Gene duplication is a genetic mutation that creates functionally redundant gene copies that are initially relieved from selective pressures and may adapt themselves to new 
functions with time. The levels of gene duplication may vary from small-scale duplication (SSD) to whole genome duplication (WGD). Studies with yeast revealed ample differences between these duplicates: Yeast WGD pairs were functionally more similar, less divergent in subcellular localization and contained a lesser proportion of essential genes. In this study, we explored the differences in evolutionary genomic properties of human SSD and WGD genes, with the identifiable human duplicates coming from the two rounds of whole genome duplication occurred early in vertebrate evolution. Results We observed that these two groups of duplicates were also dissimilar in terms of their evolutionary and genomic properties. But interestingly, this is not like the same observed in yeast. The human WGDs were found to be functionally less similar, diverge more in subcellular level and contain a higher proportion of essential genes than the SSDs, all of which are opposite from yeast. Additionally, we explored that human WGDs were more divergent in their gene expression profile, have higher multifunctionality and are more often associated with disease, and are evolutionarily more conserved than human SSDs. Conclusions Our study suggests that human WGD duplicates are more divergent and entails the adaptation of WGDs to novel and important functions that consequently lead to their evolutionary conservation in the course of evolution. Electronic supplementary material The online version of this article (doi:10.1186/s12864-016-2392-0) contains supplementary material, which is available to authorized users.}, + pmcid = {PMC4724117} +} + +@article{altschulBasicLocalAlignment1990, + title = {Basic Local Alignment Search Tool}, + author = {Altschul, Stephen F. and Gish, Warren and Miller, Webb and Myers, Eugene W. 
and Lipman, David J.}, + date = {1990-10-05}, + journaltitle = {Journal of Molecular Biology}, + shortjournal = {Journal of Molecular Biology}, + volume = {215}, + number = {3}, + pages = {403--410}, + issn = {0022-2836}, + doi = {10.1016/S0022-2836(05)80360-2}, + url = {https://www.sciencedirect.com/science/article/pii/S0022283605803602}, + urldate = {2023-04-30}, + abstract = {A new approach to rapid sequence comparison, basic local alignment search tool (BLAST), directly approximates alignments that optimize a measure of local similarity, the maximal segment pair (MSP) score. Recent mathematical results on the stochastic properties of MSP scores allow an analysis of the performance of this method as well as the statistical significance of alignments it generates. The basic algorithm is simple and robust; it can be implemented in a number of ways and applied in a variety of contexts including straight-forward DNA and protein sequence database searches, motif searches, gene identification searches, and in the analysis of multiple regions of similarity in long DNA sequences. In addition to its flexibility and tractability to mathematical analysis, BLAST is an order of magnitude faster than existing sequence comparison tools of comparable sensitivity.}, + langid = {english} +} + +@article{beallIdentificationAnalysisHyperactive2002, + title = {Identification and {{Analysis}} of a {{Hyperactive Mutant Form}} of {{Drosophila P-Element Transposase}}}, + author = {Beall, Eileen L and Mahoney, Matthew B and Rio, Donald C}, + date = {2002-09-01}, + journaltitle = {Genetics}, + shortjournal = {Genetics}, + volume = {162}, + number = {1}, + pages = {217--227}, + issn = {1943-2631}, + doi = {10.1093/genetics/162.1.217}, + url = {https://doi.org/10.1093/genetics/162.1.217}, + urldate = {2024-03-25}, + abstract = {Transposition in many organisms is regulated to control the frequency of DNA damage caused by the DNA breakage and joining reactions. 
However, genetic studies in prokaryotic systems have led to the isolation of mutant transposase proteins with higher or novel activities compared to those of the wild-type protein. In the course of our study of the effects of mutating potential ATM-family DNA damage checkpoint protein kinase sites in the Drosophila P-element transposase protein, we found one mutation, S129A, that resulted in an elevated level of transposase activity using in vivo recombination assays, including P-element-mediated germline transformation. In vitro assays for P-element transposase activity indicate that the S129A mutant exhibits elevated donor DNA cleavage activity when compared to the wild-type protein, whereas the strand-transfer activity is similar to that of wild type. This difference may reflect the nature of the in vitro assays and that normally in vivo the two reactions may proceed in concert. The P-element transposase protein contains 10 potential consensus phosphorylation sites for the ATM family of PI3-related protein kinases. Of these 10 sites, 8 affect transposase activity either positively or negatively when substituted individually with alanine and tested in vivo. A mutant transposase protein that contains all eight N-terminal serine and threonine residues substituted with alanine is inactive and can be restored to full activity by substitution of wild-type amino acids back at only 3 of the 8 positions. 
These data suggest that the activity of P-element transposase may be regulated by phosphorylation and demonstrate that one mutation, S129A, results in hyperactive transposition.} +} + +@article{blankenbergGalaxyWebbasedGenome2010, + title = {Galaxy: A Web-Based Genome Analysis Tool for Experimentalists}, + shorttitle = {Galaxy}, + author = {Blankenberg, Daniel and Von Kuster, Gregory and Coraor, Nathaniel and Ananda, Guruprasad and Lazarus, Ross and Mangan, Mary and Nekrutenko, Anton and Taylor, James}, + date = {2010-01}, + journaltitle = {Current Protocols in Molecular Biology}, + shortjournal = {Curr Protoc Mol Biol}, + volume = {Chapter 19}, + eprint = {20069535}, + eprinttype = {pmid}, + pages = {Unit 19.10.1-21}, + issn = {1934-3647}, + doi = {10.1002/0471142727.mb1910s89}, + abstract = {High-throughput data production has revolutionized molecular biology. However, massive increases in data generation capacity require analysis approaches that are more sophisticated, and often very computationally intensive. Thus, making sense of high-throughput data requires informatics support. Galaxy (http://galaxyproject.org) is a software system that provides this support through a framework that gives experimentalists simple interfaces to powerful tools, while automatically managing the computational details. Galaxy is distributed both as a publicly available Web service, which provides tools for the analysis of genomic, comparative genomic, and functional genomic data, or a downloadable package that can be deployed in individual laboratories. 
Either way, it allows experimentalists without informatics or programming expertise to perform complex large-scale analysis with just a Web browser.}, + langid = {english}, + pmcid = {PMC4264107}, + keywords = {Animals,Computational Biology,Genetic Techniques,Genome,Humans,Internet,Software Design} +} + +@misc{bouillonFTAGFinderOutil2016, + title = {{{FTAG Finder}}: {{Un}} Outil Simple Pour Déterminer Les Familles de Gènes et Les Gènes Dupliqués En Tandem Sous {{Galaxy}}}, + author = {Bouillon, Bérengère and Samson, Franck and Birmelé, Etienne and Ponger, Loïc and Rizzon, Carène}, + date = {2016} +} + +@article{buchfinkSensitiveProteinAlignments2021, + title = {Sensitive Protein Alignments at Tree-of-Life Scale Using {{DIAMOND}}}, + author = {Buchfink, Benjamin and Reuter, Klaus and Drost, Hajk-Georg}, + date = {2021-04}, + journaltitle = {Nature Methods}, + shortjournal = {Nat Methods}, + volume = {18}, + number = {4}, + pages = {366--368}, + publisher = {Nature Publishing Group}, + issn = {1548-7105}, + doi = {10.1038/s41592-021-01101-x}, + url = {https://www.nature.com/articles/s41592-021-01101-x}, + urldate = {2024-03-28}, + abstract = {We are at the beginning of a genomic revolution in which all known species are planned to be sequenced. Accessing such data for comparative analyses is crucial in this new age of data-driven biology. 
Here, we introduce an improved version of DIAMOND that greatly exceeds previous search performances and harnesses supercomputing to perform tree-of-life scale protein alignments in hours, while matching the sensitivity of the gold standard BLASTP.}, + langid = {english}, + keywords = {Computational biology and bioinformatics,Genome informatics,Genomic analysis,Sequencing,Software} +} + +@report{charlesFinalisationPipelineFTAG2023, + type = {Internship Report}, + title = {Finalisation du pipeline FTAG (Families and TAG) Finder, un outil de détection des gènes dupliqués sous Galaxy}, + author = {Charles, Séanna}, + date = {2023}, + institution = {Laboratoire de Mathématiques et Modélisation d'Évry}, + langid = {french} +} + +@article{conantTurningHobbyJob2008, + title = {Turning a Hobby into a Job: {{How}} Duplicated Genes Find New Functions}, + shorttitle = {Turning a Hobby into a Job}, + author = {Conant, Gavin C. and Wolfe, Kenneth H.}, + date = {2008-12}, + journaltitle = {Nature Reviews Genetics}, + shortjournal = {Nat Rev Genet}, + volume = {9}, + number = {12}, + pages = {938--950}, + issn = {1471-0056, 1471-0064}, + doi = {10.1038/nrg2482}, + url = {https://www.nature.com/articles/nrg2482}, + urldate = {2024-03-19}, + abstract = {Gene duplication provides raw material for functional innovation. Recent advances have shed light on two fundamental questions regarding gene duplication: which genes tend to undergo duplication? And how does natural selection subsequently act on them? Genomic data suggest that different gene classes tend to be retained after single-gene and whole-genome duplications. We also know that functional differences between duplicate genes can originate in several different ways, including mutations that directly impart new functions, subdivision of ancestral functions and selection for changes in gene dosage. 
Interestingly, in many cases the ‘new’ function of one copy is a secondary property that was always present, but that has been co-opted to a primary role after the duplication.}, + langid = {english} +} + +@article{correaTransposableElementEnvironment2021, + title = {The {{Transposable Element Environment}} of {{Human Genes Differs According}} to {{Their Duplication Status}} and {{Essentiality}}}, + author = {Correa, Margot and Lerat, Emmanuelle and Birmelé, Etienne and Samson, Franck and Bouillon, Bérengère and Normand, Kévin and Rizzon, Carène}, + date = {2021-05-01}, + journaltitle = {Genome Biology and Evolution}, + shortjournal = {Genome Biology and Evolution}, + volume = {13}, + number = {5}, + pages = {evab062}, + issn = {1759-6653}, + doi = {10.1093/gbe/evab062}, + url = {https://doi.org/10.1093/gbe/evab062}, + urldate = {2023-09-15}, + abstract = {Transposable elements (TEs) are major components of eukaryotic genomes and represent approximately 45\% of the human genome. TEs can be important sources of novelty in genomes and there is increasing evidence that TEs contribute to the evolution of gene regulation in mammals. Gene duplication is an evolutionary mechanism that also provides new genetic material and opportunities to acquire new functions. To investigate how duplicated genes are maintained in genomes, here, we explored the TE environment of duplicated and singleton genes. We found that singleton genes have more short-interspersed nuclear elements and DNA transposons in their vicinity than duplicated genes, whereas long-interspersed nuclear elements and long-terminal repeat retrotransposons have accumulated more near duplicated genes. We also discovered that this result is highly associated with the degree of essentiality of the genes with an unexpected accumulation of short-interspersed nuclear elements and DNA transposons around the more-essential genes. 
Our results underline the importance of taking into account the TE environment of genes to better understand how duplicated genes are maintained in genomes.} +} + +@article{correaTransposableElementEnvironment2021a, + title = {The {{Transposable Element Environment}} of {{Human Genes Differs According}} to {{Their Duplication Status}} and {{Essentiality}}}, + author = {Correa, Margot and Lerat, Emmanuelle and Birmelé, Etienne and Samson, Franck and Bouillon, Bérengère and Normand, Kévin and Rizzon, Carène}, + editor = {Pritham, Ellen}, + date = {2021-05-07}, + journaltitle = {Genome Biology and Evolution}, + volume = {13}, + number = {5}, + pages = {evab062}, + issn = {1759-6653}, + doi = {10.1093/gbe/evab062}, + url = {https://academic.oup.com/gbe/article/doi/10.1093/gbe/evab062/6273345}, + urldate = {2024-03-19}, + abstract = {Transposable elements (TEs) are major components of eukaryotic genomes and represent approximately 45\% of the human genome. TEs can be important sources of novelty in genomes and there is increasing evidence that TEs contribute to the evolution of gene regulation in mammals. Gene duplication is an evolutionary mechanism that also provides new genetic material and opportunities to acquire new functions. To investigate how duplicated genes are maintained in genomes, here, we explored the TE environment of duplicated and singleton genes. We found that singleton genes have more short-interspersed nuclear elements and DNA transposons in their vicinity than duplicated genes, whereas long-interspersed nuclear elements and long-terminal repeat retrotransposons have accumulated more near duplicated genes. We also discovered that this result is highly associated with the degree of essentiality of the genes with an unexpected accumulation of short-interspersed nuclear elements and DNA transposons around the more-essential genes. 
Our results underline the importance of taking into account the TE environment of genes to better understand how duplicated genes are maintained in genomes.}, + langid = {english} +} + +@article{ditommasoNextflowEnablesReproducible2017, + title = {Nextflow Enables Reproducible Computational Workflows}, + author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W and Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric}, + date = {2017-04}, + journaltitle = {Nature Biotechnology}, + shortjournal = {Nat Biotechnol}, + volume = {35}, + number = {4}, + pages = {316--319}, + issn = {1087-0156, 1546-1696}, + doi = {10.1038/nbt.3820}, + url = {https://www.nature.com/articles/nbt.3820}, + urldate = {2024-03-27}, + langid = {english} +} + +@article{ditommasoNextflowEnablesReproducible2017a, + title = {Nextflow Enables Reproducible Computational Workflows}, + author = {Di Tommaso, Paolo and Chatzou, Maria and Floden, Evan W. and Barja, Pablo Prieto and Palumbo, Emilio and Notredame, Cedric}, + date = {2017-04}, + journaltitle = {Nature Biotechnology}, + shortjournal = {Nat Biotechnol}, + volume = {35}, + number = {4}, + pages = {316--319}, + publisher = {Nature Publishing Group}, + issn = {1546-1696}, + doi = {10.1038/nbt.3820}, + url = {https://www.nature.com/articles/nbt.3820}, + urldate = {2024-03-26}, + langid = {english}, + keywords = {Computational biology and bioinformatics,Data publication and archiving} +} + +@article{djaffardjyDevelopingReusingBioinformatics2023, + title = {Developing and Reusing Bioinformatics Data Analysis Pipelines Using Scientific Workflow Systems}, + author = {Djaffardjy, Marine and Marchment, George and Sebe, Clémence and Blanchet, Raphael and Bellajhame, Khalid and Gaignard, Alban and Lemoine, Frédéric and Cohen-Boulakia, Sarah}, + date = {2023}, + journaltitle = {Computational and Structural Biotechnology Journal}, + volume = {21}, + eprint = {36968012}, + eprinttype = {pmid}, + pages = {2075}, + publisher = {{Research 
Network of Computational and Structural Biotechnology}}, + doi = {10.1016/j.csbj.2023.03.003}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10030817/}, + urldate = {2024-03-26}, + abstract = {Data analysis pipelines are now established as an effective means for specifying and executing bioinformatics data analysis and experiments. While scripting languages, particularly Python, R and notebooks, are popular and sufficient for developing small-scale ...}, + langid = {english} +} + +@online{DupliquerPourAdapter2020, + title = {Dupliquer pour s’adapter ou comment accélérer l’évolution des plantes ? | CNRS Biologie}, + shorttitle = {Dupliquer pour s’adapter ou comment accélérer l’évolution des plantes ?}, + date = {2020-10-14}, + url = {https://www.insb.cnrs.fr/fr/cnrsinfo/dupliquer-pour-sadapter-ou-comment-accelerer-levolution-des-plantes}, + urldate = {2024-03-25}, + abstract = {Les duplications de portions de chromosomes permettant aux organismes de dupliquer des gènes existants et d’en créer de nouveaux sont bien}, + langid = {french} +} + +@article{emmsOrthoFinderPhylogeneticOrthology2019, + title = {{{OrthoFinder}}: Phylogenetic Orthology Inference for Comparative Genomics}, + shorttitle = {{{OrthoFinder}}}, + author = {Emms, David M. and Kelly, Steven}, + date = {2019-11-14}, + journaltitle = {Genome Biology}, + shortjournal = {Genome Biology}, + volume = {20}, + number = {1}, + pages = {238}, + issn = {1474-760X}, + doi = {10.1186/s13059-019-1832-y}, + url = {https://doi.org/10.1186/s13059-019-1832-y}, + urldate = {2024-03-31}, + abstract = {Here, we present a major advance of the OrthoFinder method. This extends OrthoFinder’s high accuracy orthogroup inference to provide phylogenetic inference of orthologs, rooted gene trees, gene duplication events, the rooted species tree, and comparative genomics statistics. 
Each output is benchmarked on appropriate real or simulated datasets, and where comparable methods exist, OrthoFinder is equivalent to or outperforms these methods. Furthermore, OrthoFinder is the most accurate ortholog inference method on the Quest for Orthologs benchmark test. Finally, OrthoFinder’s comprehensive phylogenetic analysis is achieved with equivalent speed and scalability to the fastest, score-based heuristic methods. OrthoFinder is available at https://github.com/davidemms/OrthoFinder.}, + keywords = {Comparative genomics,Gene duplication,Gene tree inference,Ortholog inference} +} + +@article{emmsOrthoFinderSolvingFundamental2015, + title = {{{OrthoFinder}}: Solving Fundamental Biases in Whole Genome Comparisons Dramatically Improves Orthogroup Inference Accuracy}, + shorttitle = {{{OrthoFinder}}}, + author = {Emms, David M. and Kelly, Steven}, + date = {2015-08-06}, + journaltitle = {Genome Biology}, + shortjournal = {Genome Biology}, + volume = {16}, + number = {1}, + pages = {157}, + issn = {1474-760X}, + doi = {10.1186/s13059-015-0721-2}, + url = {https://doi.org/10.1186/s13059-015-0721-2}, + urldate = {2024-03-30}, + abstract = {Identifying homology relationships between sequences is fundamental to biological research. Here we provide a novel orthogroup inference algorithm called OrthoFinder that solves a previously undetected gene length bias in orthogroup inference, resulting in significant improvements in accuracy. Using real benchmark datasets we demonstrate that OrthoFinder is more accurate than other orthogroup inference methods by between 8 \% and 33 \%. 
Furthermore, we demonstrate the utility of OrthoFinder by providing a complete classification of transcription factor gene families in plants revealing 6.9 million previously unobserved relationships.}, + keywords = {Blast Score,Gene Length,Phylogenetic Distance,Sequence Similarity Score,Transcription Factor Gene Family} +} + +@article{gautRecombinationUnderappreciatedFactor2007, + title = {Recombination: An Underappreciated Factor in the Evolution of Plant Genomes}, + shorttitle = {Recombination}, + author = {Gaut, Brandon S. and Wright, Stephen I. and Rizzon, Carène and Dvorak, Jan and Anderson, Lorinda K.}, + date = {2007-01}, + journaltitle = {Nature Reviews Genetics}, + shortjournal = {Nat Rev Genet}, + volume = {8}, + number = {1}, + pages = {77--84}, + issn = {1471-0056, 1471-0064}, + doi = {10.1038/nrg1970}, + url = {https://www.nature.com/articles/nrg1970}, + urldate = {2023-09-15}, + langid = {english} +} + +@article{gibbonsEvaluationBLASTbasedEdgeweighting2015, + title = {Evaluation of {{BLAST-based}} Edge-Weighting Metrics Used for Homology Inference with the {{Markov Clustering}} Algorithm}, + author = {Gibbons, Theodore R. and Mount, Stephen M. and Cooper, Endymion D. and Delwiche, Charles F.}, + date = {2015-12}, + journaltitle = {BMC Bioinformatics}, + shortjournal = {BMC Bioinformatics}, + volume = {16}, + number = {1}, + pages = {218}, + issn = {1471-2105}, + doi = {10.1186/s12859-015-0625-x}, + url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0625-x}, + urldate = {2024-03-19}, + abstract = {Background: Clustering protein sequences according to inferred homology is a fundamental step in the analysis of many large data sets. Since the publication of the Markov Clustering (MCL) algorithm in 2002, it has been the centerpiece of several popular applications. Each of these approaches generates an undirected graph that represents sequences as nodes connected to each other by edges weighted with a BLAST-based metric. 
MCL is then used to infer clusters of homologous proteins by analyzing these graphs. The various approaches differ only by how they weight the edges, yet there has been very little direct examination of the relative performance of alternative edge-weighting metrics. This study compares the performance of four BLAST-based edge-weighting metrics: the bit score, bit score ratio (BSR), bit score over anchored length (BAL), and negative common log of the expectation value (NLE). Performance is tested using the Extended CEGMA KOGs (ECK) database, which we introduce here. Results: All metrics performed similarly when analyzing full-length sequences, but dramatic differences emerged as progressively larger fractions of the test sequences were split into fragments. The BSR and BAL successfully rescued subsets of clusters by strengthening certain types of alignments between fragmented sequences, but also shifted the largest correct scores down near the range of scores generated from spurious alignments. This penalty outweighed the benefits in most test cases, and was greatly exacerbated by increasing the MCL inflation parameter, making these metrics less robust than the bit score or the more popular NLE. Notably, the bit score performed as well or better than the other three metrics in all scenarios. Conclusions: The results provide a strong case for use of the bit score, which appears to offer equivalent or superior performance to the more popular NLE. The insight that MCL-based clustering methods can be improved using a more tractable edge-weighting metric will greatly simplify future implementations. 
We demonstrate this with our own minimalist Python implementation: Porthos, which uses only standard libraries and can process a graph with 25 m + edges connecting the 60 k + KOG sequences in half a minute using less than half a gigabyte of memory.}, + langid = {english} +} + +@article{goecksGalaxyComprehensiveApproach2010, + title = {Galaxy: A Comprehensive Approach for Supporting Accessible, Reproducible, and Transparent Computational Research in the Life Sciences}, + shorttitle = {Galaxy}, + author = {Goecks, Jeremy and Nekrutenko, Anton and Taylor, James and {Galaxy Team}}, + date = {2010}, + journaltitle = {Genome Biology}, + shortjournal = {Genome Biol}, + volume = {11}, + number = {8}, + eprint = {20738864}, + eprinttype = {pmid}, + pages = {R86}, + issn = {1474-760X}, + doi = {10.1186/gb-2010-11-8-r86}, + abstract = {Increased reliance on computational approaches in the life sciences has revealed grave concerns about how accessible and reproducible computation-reliant results truly are. Galaxy http://usegalaxy.org, an open web-based platform for genomic research, addresses these problems. Galaxy automatically tracks and manages data provenance and provides support for capturing the context and intent of computational methods. Galaxy Pages are interactive, web-based documents that provide users with a medium to communicate a complete computational analysis.}, + langid = {english}, + pmcid = {PMC2945788}, + keywords = {Algorithms,Animals,Computational Biology,Databases Nucleic Acid,Genomics,Humans,Internet} +} + +@article{golovninaMolecularPhylogenyGenus2007a, + title = {Molecular Phylogeny of the Genus {{Triticum L}}}, + author = {Golovnina, K. A. and Glushkov, S. A. and Blinov, A. G. and Mayorov, V. I. and Adkison, L. R. and Goncharov, N. P.}, + date = {2007-04-01}, + journaltitle = {Plant Systematics and Evolution}, + shortjournal = {Plant Syst. 
Evol.}, + volume = {264}, + number = {3}, + pages = {195--216}, + issn = {1615-6110}, + doi = {10.1007/s00606-006-0478-x}, + url = {https://doi.org/10.1007/s00606-006-0478-x}, + urldate = {2024-03-27}, + abstract = {The genus Triticum L. includes the major cereal crop, common or bread wheat (hexaploid Triticum aestivum L.), and other important cultivated species. Here, we conducted a phylogenetic analysis of all known wheat species and the closely related Aegilops species. This analysis was based on chloroplast matK gene comparison along with trnL intron sequences of some species. Polyploid wheat species are successfully divided only into two groups – Emmer (sections Dicoccoides and Triticum) and Timopheevii (section Timopheevii). Results reveal strictly maternal plastid inheritance of synthetic wheat amphiploids included in the study. A concordance of chloroplast origin with the definite nuclear genomes of polyploid species that were inherited at the last hybridization events was found. Our analysis suggests that there were two ancestral representatives of Aegilops speltoides Tausch that participated in the speciation of polyploid wheats with B and G genome in their genome composition. However, G genome species are younger in evolution than ones with B genome. B genome-specific PCR primers were developed for amplification of Acc-1 gene.}, + langid = {english}, + keywords = {Aegilops,molecular evolution,plasmon and B genome inheritance,Triticum,wheat} +} + +@online{HomeCromwell, + title = {Home - {{Cromwell}}}, + url = {https://cromwell.readthedocs.io/en/stable/}, + urldate = {2024-03-27} +} + +@video{javiernovoDuplicationGenes2015, + entrysubtype = {video}, + title = {Duplication of Genes}, + editor = {{Javier Novo}}, + editortype = {director}, + date = {2015}, + url = {https://www.youtube.com/watch?v=CW1tojSWPxA}, + urldate = {2024-03-27}, + abstract = {Video 4 of the third Unit of the MOOC on Genome Evolution. Paralogs and orthologs. 
Neo-functionalization and subfunctionalization.} +} + +@article{johnsonHiddenMarkovModel2010, + title = {Hidden {{Markov}} Model Speed Heuristic and Iterative {{HMM}} Search Procedure}, + author = {Johnson, L. Steven and Eddy, Sean R. and Portugaly, Elon}, + date = {2010-08-18}, + journaltitle = {BMC Bioinformatics}, + shortjournal = {BMC Bioinformatics}, + volume = {11}, + number = {1}, + pages = {431}, + issn = {1471-2105}, + doi = {10.1186/1471-2105-11-431}, + url = {https://doi.org/10.1186/1471-2105-11-431}, + urldate = {2024-04-09}, + abstract = {Profile hidden Markov models (profile-HMMs) are sensitive tools for remote protein homology detection, but the main scoring algorithms, Viterbi or Forward, require considerable time to search large sequence databases.}, + keywords = {Entropy Weighting,Iterative Search,Profile Hide Markov Model,Search Time,Test Database} +} + +@article{kosterSnakemakeScalableBioinformatics2012, + title = {Snakemake--a Scalable Bioinformatics Workflow Engine}, + author = {Köster, Johannes and Rahmann, Sven}, + date = {2012-10-01}, + journaltitle = {Bioinformatics (Oxford, England)}, + shortjournal = {Bioinformatics}, + volume = {28}, + number = {19}, + eprint = {22908215}, + eprinttype = {pmid}, + pages = {2520--2522}, + issn = {1367-4811}, + doi = {10.1093/bioinformatics/bts480}, + abstract = {SUMMARY: Snakemake is a workflow engine that provides a readable Python-based workflow definition language and a powerful execution environment that scales from single-core workstations to compute clusters without modifying the workflow. It is the first system to support the use of automatically inferred multiple named wildcards (or variables) in input and output filenames. AVAILABILITY: http://snakemake.googlecode.com. 
CONTACT: johannes.koester@uni-due.de.}, + langid = {english}, + keywords = {Computational Biology,Electronic Data Processing,Programming Languages,Software,Workflow} +} + +@online{kursNextflowWorkbenchReproducibleReusable2016, + title = {{{NextflowWorkbench}}: {{Reproducible}} and {{Reusable Workflows}} for {{Beginners}} and {{Experts}}}, + shorttitle = {{{NextflowWorkbench}}}, + author = {Kurs, Jason P. and Simi, Manuele and Campagne, Fabien}, + date = {2016-03-28}, + eprinttype = {bioRxiv}, + eprintclass = {New Results}, + pages = {041236}, + doi = {10.1101/041236}, + url = {https://www.biorxiv.org/content/10.1101/041236v2}, + urldate = {2024-03-26}, + abstract = {Computational workflows and pipelines are often created to automate series of processing steps. For instance, workflows enable one to standardize analysis for large projects or core facilities, but are also useful for individual biologists who need to perform repetitive data processing. Some workflow systems, designed for beginners, offer a graphical user interface and have been very popular with biologists. In practice, these tools are infrequently used by more experienced bioinformaticians, who may require more flexibility or performance than afforded by the user interfaces, and seem to prefer developing workflows with scripting or command line tools. Here, we present a workflow system, the NextflowWorkbench (NW), which was designed for both beginners and experts, and blends the distinction between user interface and scripting language. This system extends and reuses the popular Nextflow workflow description language and shares its advantages. In contrast to Nextflow, NextflowWorkbench offers an integrated development environment that helps complete beginners get started with workflow development. Auto-completion helps beginners who do not know the syntax of the Nextflow language. Reusable processes provide modular workflows. 
Programmers will benefit from unique interactive features that help users work more productively with docker containers. We illustrate this tool with a workflow to estimate RNA-Seq counts using Kallisto. We found that beginners can be taught how to assemble this workflow in a two hours training session. NW workflows are portable and can execute on laptop/desktop computers with docker, on a lab cluster, or in the cloud to facilitate training. NextflowWorkbench is open-source and available at http://workflow.campagnelab.org.}, + langid = {english}, + pubstate = {preprint} +} + +@thesis{lallemandEvolutionGenesDupliques2022, + type = {phdthesis}, + title = {Évolution des gènes dupliqués chez le pommier : Identification et caractérisation de la dominance du sous-génome dans le génome de la pomme}, + shorttitle = {Évolution des gènes dupliqués chez le pommier}, + author = {Lallemand, Tanguy}, + date = {2022-11-15}, + institution = {Université d'Angers}, + url = {https://theses.hal.science/tel-04081238}, + urldate = {2024-03-30}, + abstract = {Un événement de duplication du génome entier (WGD) s’est produit chez l’ancêtre du pommier (Malus domestica). Les événements de WGD ont un impact profond sur les génomes et sont connus pour être des moteurs majeurs de l’évolution. Cette WGD est relativement récente (27 Millions d’années) et fait du pommier un organisme de choix pour étudier le devenir des gènes dupliqués par autopolyploïdisation. Dans cette étude, nous avons examiné l’évolution des fragments chromosomiques dupliqués, sous le prisme d’analyses génomiques, transcriptomiques et épigénétiques. Nous avons identifié 16 779 paires de gènes dupliqués dans le génome du pommier, confirmant le caractère récent de la WGD. Les gènes au sein des paires ohnologues ne semblent pas soumis à des pressions de sélection différentes. 
Nous avons montré plusieurs déséquilibres dans la proportion de QTLs cartographiés entre fragments chromosomiques dupliqués, et caractérisé divers biais dans le fractionnement du génome, le niveau d’expression des gènes, la couverture en éléments transposables et la méthylation de l’ADN. Nos résultats suggèrent une dominance sous-chromosomique dans cet autopolyploïde, un phénomène proche de la sous dominance génomique décrite jusqu’à présent uniquement chez les allopolyploïdes.}, + langid = {french} +} + +@article{lallemandInsightsEvolutionOhnologous2023, + title = {Insights into the {{Evolution}} of {{Ohnologous Sequences}} and {{Their Epigenetic Marks Post-WGD}} in {{Malus Domestica}}}, + author = {Lallemand, Tanguy and Leduc, Martin and Desmazières, Adèle and Aubourg, Sébastien and Rizzon, Carène and Landès, Claudine and Celton, Jean-Marc}, + date = {2023-10}, + journaltitle = {Genome Biology and Evolution}, + volume = {15}, + number = {10}, + eprint = {37847638}, + eprinttype = {pmid}, + publisher = {Oxford University Press}, + doi = {10.1093/gbe/evad178}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC10601995/}, + urldate = {2024-03-30}, + abstract = {A Whole Genome Duplication (WGD) event occurred several Ma in a Rosaceae ancestor, giving rise to the Maloideae subfamily which includes today many pome fruits such as pear (Pyrus communis) and apple (Malus domestica). 
This complete and well-conserved ...}, + langid = {english} +} + +@article{lallemandOverviewDuplicatedGene2020, + title = {An {{Overview}} of {{Duplicated Gene Detection Methods}}: {{Why}} the {{Duplication Mechanism Has}} to {{Be Accounted}} for in {{Their Choice}}}, + shorttitle = {An {{Overview}} of {{Duplicated Gene Detection Methods}}}, + author = {Lallemand, Tanguy and Leduc, Martin and Landès, Claudine and Rizzon, Carène and Lerat, Emmanuelle}, + date = {2020-09-04}, + journaltitle = {Genes}, + shortjournal = {Genes}, + volume = {11}, + number = {9}, + pages = {1046}, + issn = {2073-4425}, + doi = {10.3390/genes11091046}, + url = {https://www.mdpi.com/2073-4425/11/9/1046}, + urldate = {2024-03-19}, + abstract = {Gene duplication is an important evolutionary mechanism allowing to provide new genetic material and thus opportunities to acquire new gene functions for an organism, with major implications such as speciation events. Various processes are known to allow a gene to be duplicated and different models explain how duplicated genes can be maintained in genomes. Due to their particular importance, the identification of duplicated genes is essential when studying genome evolution but it can still be a challenge due to the various fates duplicated genes can encounter. In this review, we first describe the evolutionary processes allowing the formation of duplicated genes but also describe the various bioinformatic approaches that can be used to identify them in genome sequences. Indeed, these bioinformatic approaches differ according to the underlying duplication mechanism. 
Hence, understanding the specificity of the duplicated genes of interest is a great asset for tool selection and should be taken into account when exploring a biological question.}, + langid = {english} +} + +@article{lannesDoesPresenceTransposable2019, + title = {Does the {{Presence}} of {{Transposable Elements Impact}} the {{Epigenetic Environment}} of {{Human Duplicated Genes}}?}, + author = {Lannes, Romain and Rizzon, Carène and Lerat, Emmanuelle}, + date = {2019-03-26}, + journaltitle = {Genes}, + shortjournal = {Genes (Basel)}, + volume = {10}, + number = {3}, + eprint = {30917603}, + eprinttype = {pmid}, + pages = {249}, + issn = {2073-4425}, + doi = {10.3390/genes10030249}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6470583/}, + urldate = {2023-09-15}, + abstract = {Epigenetic modifications have an important role to explain part of the intra- and inter-species variation in gene expression. They also have a role in the control of transposable elements (TEs) whose activity may have a significant impact on genome evolution by promoting various mutations, which are expected to be mostly deleterious. A change in the local epigenetic landscape associated with the presence of TEs is expected to affect the expression of neighboring genes since these modifications occurring at TE sequences can spread to neighboring sequences. In this work, we have studied how the epigenetic modifications of genes are conserved and what the role of TEs is in this conservation. For that, we have compared the conservation of the epigenome associated with human duplicated genes and the differential presence of TEs near these genes. 
Our results show higher epigenome conservation of duplicated genes from the same family when they share similar TE environment, suggesting a role for the differential presence of TEs in the evolutionary divergence of duplicates through variation in the epigenetic landscape.}, + pmcid = {PMC6470583} +} + +@thesis{leducEtudeEvolutionGenes, + title = {Étude de l’évolution des gènes dupliqués chez les Rosaceae}, + author = {Leduc, Martin}, + langid = {french} +} + +@article{leitchGenomicPlasticityDiversity2008, + title = {Genomic Plasticity and the Diversity of Polyploid Plants}, + author = {Leitch, A. R. and Leitch, I. J.}, + date = {2008-04-25}, + journaltitle = {Science (New York, N.Y.)}, + shortjournal = {Science}, + volume = {320}, + number = {5875}, + eprint = {18436776}, + eprinttype = {pmid}, + pages = {481--483}, + issn = {1095-9203}, + doi = {10.1126/science.1153585}, + abstract = {Polyploidy, a change whereby the entire chromosome set is multiplied, arises through mitotic or meiotic misdivisions and frequently involves unreduced gametes and interspecific hybridization. The success of newly formed angiosperm polyploids is partly attributable to their highly plastic genome structure, as manifested by tolerance to changing chromosome numbers (aneuploidy and polyploidy), genome size, (retro)transposable element mobility, insertions, deletions, and epigenome restructuring. The ability to withstand large-scale changes, frequently within one or a few generations, is associated with a restructuring of the transcriptome, metabolome, and proteome and can result in an altered phenotype and ecology. Thus, polyploid-induced changes can generate individuals that are able to exploit new niches or to outcompete progenitor species. 
This process has been a major driving force behind the divergence of the angiosperms and their biodiversity.}, + langid = {english}, + keywords = {Biodiversity,Biological Evolution,Chromosomes Plant,Genetic Speciation,Genetic Variation,Genome Plant,Hybridization Genetic,Magnoliopsida,Nondisjunction Genetic,Plant Proteins,Polyploidy,Proteome,Transcription Genetic} +} + +@article{longGeneDuplicationEvolution2001, + title = {Gene {{Duplication}} and {{Evolution}}}, + author = {Long, Manyuan and Thornton, Kevin}, + date = {2001-08-31}, + journaltitle = {Science}, + volume = {293}, + number = {5535}, + pages = {1551--1551}, + publisher = {American Association for the Advancement of Science}, + doi = {10.1126/science.293.5535.1551a}, + url = {https://www.science.org/doi/abs/10.1126/science.293.5535.1551a}, + urldate = {2024-03-28} +} + +@article{lynchEvolutionaryFateConsequences2000, + title = {The {{Evolutionary Fate}} and {{Consequences}} of {{Duplicate Genes}}}, + author = {Lynch, Michael and Conery, John S.}, + date = {2000-11-10}, + journaltitle = {Science}, + volume = {290}, + number = {5494}, + pages = {1151--1155}, + publisher = {American Association for the Advancement of Science}, + doi = {10.1126/science.290.5494.1151}, + url = {https://www.science.org/doi/abs/10.1126/science.290.5494.1151}, + urldate = {2024-03-28}, + abstract = {Gene duplication has generally been viewed as a necessary source of material for the origin of evolutionary novelties, but it is unclear how often gene duplicates arise and how frequently they evolve new functions. Observations from the genomic databases for several eukaryotic species suggest that duplicate genes arise at a very high rate, on average 0.01 per gene per million years. Most duplicated genes experience a brief period of relaxed selection early in their history, with a moderate fraction of them evolving in an effectively neutral manner during this period. 
However, the vast majority of gene duplicates are silenced within a few million years, with the few survivors subsequently experiencing strong purifying selection. Although duplicate genes may only rarely evolve new functions, the stochastic silencing of such genes may play a significant role in the passive origin of new species.} +} + +@article{maoGenoDupPipelineTool2019, + title = {{{GenoDup Pipeline}}: A Tool to Detect Genome Duplication Using the {{dS-based}} Method}, + shorttitle = {{{GenoDup Pipeline}}}, + author = {Mao, Yafei}, + date = {2019-01-23}, + journaltitle = {PeerJ}, + shortjournal = {PeerJ}, + volume = {7}, + eprint = {30697488}, + eprinttype = {pmid}, + pages = {e6303}, + issn = {2167-8359}, + doi = {10.7717/peerj.6303}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6347962/}, + urldate = {2024-03-21}, + abstract = {Understanding whole genome duplication (WGD), or polyploidy, is fundamental to investigating the origin and diversification of organisms in evolutionary biology. The wealth of genomic data generated by next generation sequencing (NGS) has resulted in an urgent need for handy and accurate tools to detect WGD. Here, I present a useful and user-friendly pipeline called GenoDup for inferring WGD using the dS-based method. I have successfully applied GenoDup to identify WGD in empirical data from both plants and animals. The GenoDup Pipeline provides a reliable and useful tool to infer WGD from NGS data.}, + pmcid = {PMC6347962} +} + +@online{molderSustainableDataAnalysis2021a, + title = {Sustainable Data Analysis with {{Snakemake}}}, + author = {Mölder, Felix and Jablonski, Kim Philipp and Letcher, Brice and Hall, Michael B. and Tomkins-Tinch, Christopher H. and Sochat, Vanessa and Forster, Jan and Lee, Soohyun and Twardziok, Sven O. 
and Kanitz, Alexander and Wilm, Andreas and Holtgrewe, Manuel and Rahmann, Sven and Nahnsen, Sven and Köster, Johannes}, + date = {2021-04-19}, + number = {10:33}, + eprint = {10:33}, + eprinttype = {F1000Research}, + doi = {10.12688/f1000research.29032.2}, + url = {https://f1000research.com/articles/10-33}, + urldate = {2024-03-26}, + abstract = {Data analysis often entails a multitude of heterogeneous steps, from the application of various command line tools to the usage of scripting languages like R or Python for the generation of plots and tables. It is widely recognized that data analyses should ideally be conducted in a reproducible way.\ Reproducibility enables technical validation and regeneration of results on the original or even new data. However, reproducibility alone is by no means sufficient to deliver an analysis that is of lasting impact (i.e., sustainable) for the field, or even just one research group. We postulate that it is equally important to ensure adaptability and transparency. The former describes the ability to modify the analysis to answer extended or slightly different research questions. The latter describes the ability to understand the analysis in order to judge whether it is not only technically, but methodologically valid. Here, we analyze the properties needed for a data analysis to become reproducible, adaptable, and transparent. 
We show how the popular workflow management system Snakemake can be used to guarantee this, and how it enables an ergonomic, combined, unified representation of all steps involved in data analysis, ranging from raw data processing, to quality control and fine-grained, interactive exploration and plotting of final results.}, + langid = {english}, + pubstate = {preprint}, + keywords = {adaptability,data analysis,reproducibility,scalability,sustainability,transparency,workflow management} +} + +@article{nozawaEvolutionaryDynamicsOlfactory2007, + title = {Evolutionary Dynamics of Olfactory Receptor Genes in {{Drosophila}} Species}, + author = {Nozawa, Masafumi and Nei, Masatoshi}, + date = {2007-04-24}, + journaltitle = {Proceedings of the National Academy of Sciences}, + volume = {104}, + number = {17}, + pages = {7122--7127}, + publisher = {Proceedings of the National Academy of Sciences}, + doi = {10.1073/pnas.0702133104}, + url = {https://www.pnas.org/doi/full/10.1073/pnas.0702133104}, + urldate = {2024-04-02}, + abstract = {Olfactory receptor (OR) genes are of vital importance for animals to find food, identify mates, and avoid dangers. In mammals, the number of OR genes is large and varies extensively among different orders, whereas, in insects, the extent of interspecific variation appears to be small, although only a few species have been studied. To understand the evolutionary changes of OR genes, we identified all OR genes from 12 Drosophila species, of which the evolutionary time is roughly equivalent to that of eutherian mammals. The results showed that all species examined have similar numbers (≈60) of functional OR genes. Phylogenetic analysis indicated that the ancestral species also had similar numbers of genes, but there were frequent gains and losses of genes that occurred in each evolutionary lineage. It appears that tandem duplication and random inactivation of duplicate genes are the major factors of gene number change. 
However, chromosomal rearrangements have contributed to the establishment of genome-wide distribution of OR genes. These results suggest that the repertoire of OR genes in Drosophila has been quite stable compared with the mammalian genes. The difference in evolutionary pattern between Drosophila and mammals can be explained partly by the differences of gene expression mechanisms and partly by the environmental and behavioral differences.} +} + +@book{ohnoEvolutionGeneDuplication1970, + title = {Evolution by {{Gene Duplication}}}, + author = {Ohno, Susumu}, + date = {1970}, + publisher = {Springer Berlin Heidelberg}, + location = {Berlin, Heidelberg}, + doi = {10.1007/978-3-642-86659-3}, + url = {http://link.springer.com/10.1007/978-3-642-86659-3}, + urldate = {2024-03-21}, + isbn = {978-3-642-86661-6}, + langid = {english} +} + +@online{PEPkitBioData, + title = {{{PEPkit}}: The Bio Data Management Toolkit - {{PEPkit}}: The Bio Data Management Toolkit}, + url = {https://pep.databio.org/}, + urldate = {2024-03-27} +} + +@article{picart-picoloLargeTandemDuplications2020, + title = {Large Tandem Duplications Affect Gene Expression, {{3D}} Organization, and Plant–Pathogen Response}, + author = {Picart-Picolo, Ariadna and Grob, Stefan and Picault, Nathalie and Franek, Michal and Llauro, Christel and Halter, Thierry and Maier, Tom R. and Jobet, Edouard and Descombin, Julie and Zhang, Panpan and Paramasivan, Vijayapalani and Baum, Thomas J. and Navarro, Lionel and Dvořáčková, Martina and Mirouze, Marie and Pontvianne, Frédéric}, + date = {2020-10-08}, + journaltitle = {Genome Research}, + shortjournal = {Genome Res.}, + eprint = {33033057}, + eprinttype = {pmid}, + publisher = {Cold Spring Harbor Lab}, + issn = {1088-9051, 1549-5469}, + doi = {10.1101/gr.261586.120}, + url = {https://genome.cshlp.org/content/early/2020/10/05/gr.261586.120}, + urldate = {2024-03-25}, + abstract = {Rapid plant genome evolution is crucial to adapt to environmental changes. 
Chromosomal rearrangements and gene copy number variation (CNV) are two important tools for genome evolution and sources for the creation of new genes. However, their emergence takes many generations. In this study, we show that in Arabidopsis thaliana, a significant loss of ribosomal RNA (rRNA) genes with a past history of a mutation for the chromatin assembly factor 1 (CAF1) complex causes rapid changes in the genome structure. Using long-read sequencing and microscopic approaches, we have identified up to 15 independent large tandem duplications in direct orientation (TDDOs) ranging from 60 kb to 1.44 Mb. Our data suggest that these TDDOs appeared within a few generations, leading to the duplication of hundreds of genes. By subsequently focusing on a line only containing 20\% of rRNA gene copies (20rDNA line), we investigated the impact of TDDOs on 3D genome organization, gene expression, and cytosine methylation. We found that duplicated genes often accumulate more transcripts. Among them, several are involved in plant–pathogen response, which could explain why the 20rDNA line is hyper-resistant to both bacterial and nematode infections. Finally, we show that the TDDOs create gene fusions and/or truncations and discuss their potential implications for the evolution of plant genomes.}, + langid = {english} +} + +@online{ponsComputingCommunitiesLarge2005, + title = {Computing Communities in Large Networks Using Random Walks (Long Version)}, + author = {Pons, Pascal and Latapy, Matthieu}, + date = {2005-12-12}, + eprint = {physics/0512106}, + eprinttype = {arxiv}, + doi = {10.48550/arXiv.physics/0512106}, + url = {http://arxiv.org/abs/physics/0512106}, + urldate = {2024-03-30}, + abstract = {Dense subgraphs of sparse graphs (communities), which appear in most real-world complex networks, play an important role in many contexts. Computing them however is generally expensive. 
We propose here a measure of similarities between vertices based on random walks which has several important advantages: it captures well the community structure in a network, it can be computed efficiently, and it can be used in an agglomerative algorithm to compute efficiently the community structure of a network. We propose such an algorithm, called Walktrap, which runs in time O(mn\^{}2) and space O(n\^{}2) in the worst case, and in time O(n\^{}2log n) and space O(n\^{}2) in most real-world cases (n and m are respectively the number of vertices and edges in the input graph). Extensive comparison tests show that our algorithm surpasses previously proposed ones concerning the quality of the obtained community structures and that it stands among the best ones concerning the running time.}, + pubstate = {preprint}, + keywords = {Condensed Matter - Disordered Systems and Neural Networks,Condensed Matter - Statistical Mechanics,Physics - Physics and Society} +} + +@article{reamsSelectionGeneClustering2004, + title = {Selection for {{Gene Clustering}} by {{Tandem Duplication}}}, + author = {Reams, Andrew B. and Neidle, Ellen L.}, + date = {2004-10-01}, + journaltitle = {Annual Review of Microbiology}, + shortjournal = {Annu. Rev. Microbiol.}, + volume = {58}, + number = {1}, + pages = {119--142}, + issn = {0066-4227, 1545-3251}, + doi = {10.1146/annurev.micro.58.030603.123806}, + url = {https://www.annualreviews.org/doi/10.1146/annurev.micro.58.030603.123806}, + urldate = {2024-03-28}, + abstract = {▪ Abstract\enspace{} In prokaryotic genomes, related genes are frequently clustered in operons and higher-order arrangements that reflect functional context. Organization emerges despite rearrangements that constantly shuffle gene and operon order. Evidence is presented that the tandem duplication of related genes acts as a driving evolutionary force in the origin and maintenance of clusters. 
Gene amplification can be viewed as a dynamic and reversible regulatory mechanism that facilitates adaptation to variable environments. Clustered genes confer selective benefits via their ability to be coamplified. During evolution, rearrangements that bring together related genes can be selected if they increase the fitness of the organism in which they reside. Similarly, the benefits of gene amplification can prevent the dispersal of existing clusters. Examples of frequent and spontaneous amplification of large genomic fragments are provided. The possibility is raised that tandem gene duplication works in concert with horizontal gene transfer as interrelated evolutionary forces for gene clustering.}, + langid = {english} +} + +@article{rizzonRizzonMaraisGouy2002, + title = {Recombination Rate and the Distribution of Transposable Elements in the {{Drosophila}} Melanogaster Genome}, + shorttitle = {Recombination Rate and the Distribution of Transposable Elements in the {{Drosophila}} Melanogaster Genome}, + author = {Rizzon, Carène and Marais, Gabriel and Gouy, Manolo and Biémont, Christian}, + date = {2002-04-01}, + journaltitle = {Genome research}, + shortjournal = {Genome research}, + volume = {12}, + pages = {400--407}, + doi = {10.1101/gr.210802}, + abstract = {We analyzed the distribution of 54 families of transposable elements (TEs; transposons, LTR retrotransposons, and non-LTR retrotransposons) in the chromosomes of Drosophila melanogaster, using data from the sequenced genome. The density of LTR and non-LTR retrotransposons (RNA-based elements) was high in regions with low recombination rates, but there was no clear tendency to parallel the recombination rate. However, the density of transposons (DNA-based elements) was significantly negatively correlated with recombination rate.
The accumulation of TEs in regions of reduced recombination rate is compatible with selection acting against TEs, as selection is expected to be weaker in regions with lower recombination. The differences in the relationship between recombination rate and TE density that exist between chromosome arms suggest that TE distribution depends on specific characteristics of the chromosomes (chromatin structure, distribution of other sequences), the TEs themselves (transposition mechanism), and the species (reproductive system, effective population size, etc.), that have differing influences on the effect of natural selection acting against the TE insertions.} +} + +@article{rognesParAlignParallelSequence2001, + title = {{{ParAlign}}: A Parallel Sequence Alignment Algorithm for Rapid and Sensitive Database Searches}, + shorttitle = {{{ParAlign}}}, + author = {Rognes, Torbjørn}, + date = {2001-04-01}, + journaltitle = {Nucleic Acids Research}, + shortjournal = {Nucleic Acids Research}, + volume = {29}, + number = {7}, + pages = {1647--1652}, + issn = {0305-1048}, + doi = {10.1093/nar/29.7.1647}, + url = {https://doi.org/10.1093/nar/29.7.1647}, + urldate = {2024-04-09}, + abstract = {There is a need for faster and more sensitive algorithms for sequence similarity searching in view of the rapidly increasing amounts of genomic sequence data available. Parallel processing capabilities in the form of the single instruction, multiple data (SIMD) technology are now available in common microprocessors and enable a single microprocessor to perform many operations in parallel. The ParAlign algorithm has been specifically designed to take advantage of this technology. The new algorithm initially exploits parallelism to perform a very rapid computation of the exact optimal ungapped alignment score for all diagonals in the alignment matrix. Then, a novel heuristic is employed to compute an approximate score of a gapped alignment by combining the scores of several diagonals. 
This approximate score is used to select the most interesting database sequences for a subsequent Smith–Waterman alignment, which is also parallelised. The resulting method represents a substantial improvement compared to existing heuristics. The sensitivity and specificity of ParAlign was found to be as good as Smith–Waterman implementations when the same method for computing the statistical significance of the matches was used. In terms of speed, only the significantly less sensitive NCBI BLAST 2 program was found to outperform the new approach. Online searches are available at http://dna.uio.no/search/} +} + +@article{rognesSixfoldSpeedupSmith2000, + title = {Six-Fold Speed-up of {{Smith}}–{{Waterman}} Sequence Database Searches Using Parallel Processing on Common Microprocessors}, + author = {Rognes, Torbjørn and Seeberg, Erling}, + date = {2000-08-01}, + journaltitle = {Bioinformatics}, + shortjournal = {Bioinformatics}, + volume = {16}, + number = {8}, + pages = {699--706}, + issn = {1367-4803}, + doi = {10.1093/bioinformatics/16.8.699}, + url = {https://doi.org/10.1093/bioinformatics/16.8.699}, + urldate = {2024-04-09}, + abstract = {Motivation: Sequence database searching is among the most important and challenging tasks in bioinformatics. The ultimate choice of sequence-search algorithm is that of Smith–Waterman. However, because of the computationally demanding nature of this method, heuristic programs or special-purpose hardware alternatives have been developed. Increased speed has been obtained at the cost of reduced sensitivity or very expensive hardware. Results: A fast implementation of the Smith–Waterman sequence-alignment algorithm using Single-Instruction, Multiple-Data (SIMD) technology is presented. This implementation is based on the MultiMedia eXtensions (MMX) and Streaming SIMD Extensions (SSE) technology that is embedded in Intel’s latest microprocessors. Similar technology exists also in other modern microprocessors.
Six-fold speed-up relative to the fastest previously known Smith–Waterman implementation on the same hardware was achieved by an optimized 8-way parallel processing approach. A speed of more than 150 million cell updates per second was obtained on a single Intel Pentium III 500 MHz microprocessor. This is probably the fastest implementation of this algorithm on a single general-purpose microprocessor described to date. Availability: Online searches with the software are available at http://dna.uio.no/search/ Contact: torbjorn.rognes@labmed.uio.no To whom correspondence should be addressed.} +} + +@article{saeboPARALIGNRapidSensitive2005, + title = {{{PARALIGN}}: Rapid and Sensitive Sequence Similarity Searches Powered by Parallel Computing Technology}, + shorttitle = {{{PARALIGN}}}, + author = {Sæbø, Per Eystein and Andersen, Sten Morten and Myrseth, Jon and Laerdahl, Jon K. and Rognes, Torbjørn}, + date = {2005-07-01}, + journaltitle = {Nucleic Acids Research}, + shortjournal = {Nucleic Acids Research}, + volume = {33}, + pages = {W535-W539}, + issn = {0305-1048}, + doi = {10.1093/nar/gki423}, + url = {https://doi.org/10.1093/nar/gki423}, + urldate = {2024-04-09}, + abstract = {PARALIGN is a rapid and sensitive similarity search tool for the identification of distantly related sequences in both nucleotide and amino acid sequence databases. Two algorithms are implemented, accelerated Smith–Waterman and ParAlign. The ParAlign algorithm is similar to Smith–Waterman in sensitivity, while as quick as BLAST for protein searches. A form of parallel computing technology known as multimedia technology that is available in modern processors, but rarely used by other bioinformatics software, has been exploited to achieve the high speed. The software is also designed to run efficiently on computer clusters using the message-passing interface standard.
A public search service powered by a large computer cluster has been set-up and is freely available at www.paralign.org , where the major public databases can be searched. The software can also be downloaded free of charge for academic use.}, + issue = {suppl\_2} +} + +@article{smithIdentificationCommonMolecular1981, + title = {Identification of Common Molecular Subsequences}, + author = {Smith, T. F. and Waterman, M. S.}, + date = {1981-03-25}, + journaltitle = {Journal of Molecular Biology}, + shortjournal = {Journal of Molecular Biology}, + volume = {147}, + number = {1}, + pages = {195--197}, + issn = {0022-2836}, + doi = {10.1016/0022-2836(81)90087-5}, + url = {https://www.sciencedirect.com/science/article/pii/0022283681900875}, + urldate = {2023-04-29}, + langid = {english} +} + +@article{suyamaPAL2NALRobustConversion2006, + title = {{{PAL2NAL}}: Robust Conversion of Protein Sequence Alignments into the Corresponding Codon Alignments}, + shorttitle = {{{PAL2NAL}}}, + author = {Suyama, Mikita and Torrents, David and Bork, Peer}, + date = {2006-07-01}, + journaltitle = {Nucleic Acids Research}, + shortjournal = {Nucleic Acids Research}, + volume = {34}, + pages = {W609-W612}, + issn = {0305-1048}, + doi = {10.1093/nar/gkl315}, + url = {https://doi.org/10.1093/nar/gkl315}, + urldate = {2024-03-31}, + abstract = {PAL2NAL is a web server that constructs a multiple codon alignment from the corresponding aligned protein sequences. Such codon alignments can be used to evaluate the type and rate of nucleotide substitutions in coding DNA for a wide range of evolutionary analyses, such as the identification of levels of selective constraint acting on genes, or to perform DNA-based phylogenetic studies. The server takes a protein sequence alignment and the corresponding DNA sequences as input. 
In contrast to other existing applications, this server is able to construct codon alignments even if the input DNA sequence has mismatches with the input protein sequence, or contains untranslated regions and polyA tails. The server can also deal with frame shifts and inframe stop codons in the input models, and is thus suitable for the analysis of pseudogenes. Another distinct feature is that the user can specify a subregion of the input alignment in order to specifically analyze functional domains or exons of interest. The PAL2NAL server is available at http://www.bork.embl.de/pal2nal .}, + issue = {suppl\_2} +} + +@article{taylorDuplicationDivergenceEvolution2004, + title = {Duplication and {{Divergence}}: {{The Evolution}} of {{New Genes}} and {{Old Ideas}}}, + shorttitle = {Duplication and {{Divergence}}}, + author = {Taylor, John S. and Raes, Jeroen}, + date = {2004-12-15}, + journaltitle = {Annual Review of Genetics}, + volume = {38}, + pages = {615--643}, + publisher = {Annual Reviews}, + issn = {0066-4197, 1545-2948}, + doi = {10.1146/annurev.genet.38.072902.092831}, + url = {https://www.annualreviews.org/content/journals/10.1146/annurev.genet.38.072902.092831}, + urldate = {2024-03-25}, + abstract = {▪ Abstract Over 35 years ago, Susumu Ohno stated that gene duplication was the single most important factor in evolution (97). He reiterated this point a few years later in proposing that without duplicated genes the creation of metazoans, vertebrates, and mammals from unicellular organisms would have been impossible. Such big leaps in evolution, he argued, required the creation of new gene loci with previously nonexistent functions (98). Bold statements such as these, combined with his proposal that at least one whole-genome duplication event facilitated the evolution of vertebrates, have made Ohno an icon in the literature on genome evolution. 
However, discussion on the occurrence and consequences of gene and genome duplication events has a much longer, and often neglected, history. Here we review literature dealing with the occurrence and consequences of gene duplication, beginning in 1911. We document conceptual and technological advances in gene duplication research from this early research in comparative cytology up to recent research on whole genomes, “transcriptomes,” and “interactomes.” We have formerly seen that parts many times repeated are eminently liable to vary in number and structure; consequently it is quite probable that natural selection, during the long-continued course of modification, should have seized on a certain number of the primordially similar elements, many times repeated, and have adapted them to the most diverse purposes. Charles Darwin, 1859 (23)}, + issue = {Volume 38, 2004}, + langid = {english} +} + +@article{taylorUsingGalaxyPerform2007, + title = {Using Galaxy to Perform Large-Scale Interactive Data Analyses}, + author = {Taylor, James and Schenck, Ian and Blankenberg, Dan and Nekrutenko, Anton}, + date = {2007-09}, + journaltitle = {Current Protocols in Bioinformatics}, + shortjournal = {Curr Protoc Bioinformatics}, + volume = {Chapter 10}, + eprint = {18428782}, + eprinttype = {pmid}, + pages = {Unit 10.5}, + issn = {1934-340X}, + doi = {10.1002/0471250953.bi1005s19}, + abstract = {While most experimental biologists know where to download genomic data, few have a concrete plan on how to analyze it. This situation can be corrected by: (1) providing unified portals serving genomic data and (2) building Web applications to allow flexible retrieval and on-the-fly analyses of the data. Powerful resources, such as the UCSC Genome Browser already address the first issue. The second issue, however, remains open.
For example, how to find human protein-coding exons with the highest density of single nucleotide polymorphisms (SNPs) and extract orthologous sequences from all sequenced mammals? Indeed, one can access all relevant data from the UCSC Genome Browser. But once the data is downloaded how would one deal with millions of SNPs and gigabytes of alignments? Galaxy (http://g2.bx.psu.edu) is designed specifically for that purpose. It amplifies the strengths of existing resources (such as UCSC Genome Browser) by allowing the user to access and, most importantly, analyze data within a single interface in an unprecedented number of ways.}, + langid = {english}, + pmcid = {PMC3418382}, + keywords = {Algorithms,Base Sequence,Chromosome Mapping,Computer Graphics,DNA,DNA Mutational Analysis,Molecular Sequence Data,Sequence Alignment,Sequence Analysis DNA,Software,User-Computer Interface} +} + +@article{vandongenGraphClusteringDiscrete2008a, + title = {Graph {{Clustering Via}} a {{Discrete Uncoupling Process}}}, + author = {Van Dongen, Stijn}, + date = {2008-01}, + journaltitle = {SIAM Journal on Matrix Analysis and Applications}, + shortjournal = {SIAM J. Matrix Anal. Appl.}, + volume = {30}, + number = {1}, + pages = {121--141}, + publisher = {{Society for Industrial and Applied Mathematics}}, + issn = {0895-4798}, + doi = {10.1137/040608635}, + url = {https://epubs.siam.org/doi/10.1137/040608635}, + urldate = {2024-03-22}, + abstract = {We generalize the concepts of sign symmetry and weak sign symmetry by defining k-sign symmetric matrices. For a positive integer k, we show that all diagonal shifts of an irreducible matrix are k-sign symmetric if and only if the matrix is diagonally similar to a Hermitian matrix. A similar result holds for scalar shifts, but requires an additional condition in the case \$k = 1\$. 
Extensions are given to reducible matrices.} +} + +@article{vandongenNewClusterAlgorithm1998, + title = {A New Cluster Algorithm for Graphs}, + author = {family=Dongen, given=S., prefix=van, useprefix=true}, + date = {1998-01-01}, + number = {R 9814}, + url = {https://ir.cwi.nl/pub/4604}, + urldate = {2024-03-22}, + abstract = {A new cluster algorithm for graphs called the \emph{Markov Cluster algorithm} (\$MCL\$ algorithm) is introduced. The graphs may be both weighted (with nonnegative weight) and directed. Let~\$G\$~be such a graph. The \$MCL\$ algorithm simulates flow in \$G\$ by first identifying \$G\$ in a canonical way with a Markov graph \$G\_1\$. Flow is then alternatingly expanded and contracted, leading to a row of Markov Graphs \$G\_\{(i)\}\$. The expansion step is done by computing higher step transition probabilities (\$TP\$'s), the contraction step creates a new Markov graph by favouring high \$TP\$'s and demoting low \$TP\$'s in a specific way. The heuristic underlying this approach is the expectation that flow between dense regions which are sparsely connected will evaporate. The stable limits of the process are easily derived and in practice the algorithm converges very fast to such a limit, the structure of which has a generic interpretation as an overlapping clustering of the graph~\$G\$. Overlap is limited to cases where the input graph has a symmetric structure inducing it. The contraction and expansion parameters of the algorithm influence the granularity of the output. The algorithm is space and time efficient with a space\$+\$quality/time trade--off, works very well for a wide range of test cases, and lends itself to drastic scaling. Experiments with a scaled \$C\$--implementation have been conducted on graphs having several tens of thousands of nodes. This report describes the algorithm, its complexity, and experimental results.
The algorithm is introduced by first considering a generalization of generic single link clustering for graphs called \$k\$--path clustering.}, + langid = {english} +} + +@article{wolfeRobustnessItNot2000, + title = {Robustness—It's Not Where You Think It Is}, + author = {Wolfe, Ken}, + date = {2000-05}, + journaltitle = {Nature Genetics}, + shortjournal = {Nat Genet}, + volume = {25}, + number = {1}, + pages = {3--4}, + issn = {1061-4036, 1546-1718}, + doi = {10.1038/75560}, + url = {https://www.nature.com/articles/ng0500_3}, + urldate = {2024-03-28}, + langid = {english} +} + +@article{yangWGDdetectorPipelineDetecting2019, + title = {{{WGDdetector}}: A Pipeline for Detecting Whole Genome Duplication Events Using the Genome or Transcriptome Annotations}, + shorttitle = {{{WGDdetector}}}, + author = {Yang, Yongzhi and Li, Ying and Chen, Qiao and Sun, Yongshuai and Lu, Zhiqiang}, + date = {2019-12}, + journaltitle = {BMC Bioinformatics}, + shortjournal = {BMC Bioinformatics}, + volume = {20}, + number = {1}, + pages = {75}, + issn = {1471-2105}, + doi = {10.1186/s12859-019-2670-3}, + url = {https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-2670-3}, + urldate = {2024-03-19}, + abstract = {Background: With the availability of well-assembled genomes of a growing number of organisms, identifying the bioinformatic basis of whole genome duplication (WGD) is a growing field of genomics. The most extant software for detecting footprints of WGDs has been restricted to a well-assembled genome. However, the massive poor quality genomes and the more accessible transcriptomes have been largely ignored, and in theoretically they are also likely to contribute to detect WGD using dS based method. Here, to resolve these problems, we have designed a universal and simple technical tool WGDdetector for detecting WGDs using either genome or transcriptome annotations in different organisms based on the widely used dS based method.
Results: We have constructed WGDdetector pipeline that integrates all analyses including gene family constructing, dS estimating and phasing, and outputting the dS values of each paralogs pairs processed with only one command. We further chose four species (Arabidopsis thaliana, Juglans regia, Populus trichocarpa and Xenopus laevis) representing herb, wood and animal, to test its practicability. Our final results showed a high degree of accuracy with the previous studies using both genome and transcriptome data. Conclusion: WGDdetector is not only reliable and stable for genome data, but also a new way to using the transcriptome data to obtain the correct dS distribution for detecting WGD. The source code is freely available, and is implemented in Windows and Linux operation system.}, + langid = {english} +} + +@article{zhangLandscapeVariationNovel2017, + title = {Landscape and Variation of Novel Retroduplications in 26 Human Populations}, + author = {Zhang, Yan and Li, Shantao and Abyzov, Alexej and Gerstein, Mark B.}, + date = {2017-06-29}, + journaltitle = {PLOS Computational Biology}, + shortjournal = {PLOS Computational Biology}, + volume = {13}, + number = {6}, + pages = {e1005567}, + publisher = {Public Library of Science}, + issn = {1553-7358}, + doi = {10.1371/journal.pcbi.1005567}, + url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005567}, + urldate = {2024-03-27}, + abstract = {Retroduplications come from reverse transcription of mRNAs and their insertion back into the genome. Here, we performed comprehensive discovery and analysis of retroduplications in a large cohort of 2,535 individuals from 26 human populations, as part of 1000 Genomes Phase 3. We developed an integrated approach to discover novel retroduplications combining high-coverage exome and low-coverage whole-genome sequencing data, utilizing information from both exon-exon junctions and discordant paired-end reads. 
We found 503 parent genes having novel retroduplications absent from the reference genome. Based solely on retroduplication variation, we built phylogenetic trees of human populations; these represent superpopulation structure well and indicate that variable retroduplications are effective population markers. We further identified 43 retroduplication parent genes differentiating superpopulations. This group contains several interesting insertion events, including a SLMO2 retroduplication and insertion into CAV3, which has a potential disease association. We also found retroduplications to be associated with a variety of genomic features: (1) Insertion sites were correlated with regular nucleosome positioning. (2) They, predictably, tend to avoid conserved functional regions, such as exons, but, somewhat surprisingly, also avoid introns. (3) Retroduplications tend to be co-inserted with young L1 elements, indicating recent retrotranspositional activity, and (4) they have a weak tendency to originate from highly expressed parent genes. Our investigation provides insight into the functional impact and association with genomic elements of retroduplications. We anticipate our approach and analytical methodology to have application in a more clinical context, where exome sequencing data is abundant and the discovery of retroduplications can potentially improve the accuracy of SNP calling.}, + langid = {english}, + keywords = {Functional genomics,Gene expression,Genomics,Human genomics,Nucleosomes,Phylogenetic analysis,Pseudogenes,Single nucleotide polymorphisms} +} diff --git a/report.md b/report.md new file mode 100644 index 0000000..05acbd9 --- /dev/null +++ b/report.md @@ -0,0 +1,174 @@ +
+keywords: duplicate genes, tandemly arrayed genes, pipeline +
+