feat: Add duplicate gene fate in HTML export

This commit is contained in:
Samuel Ortion 2024-04-19 21:25:39 +02:00
parent 30807f2bd0
commit a4bf5bd5ce
Signed by: sortion
GPG Key ID: 9B02406F8C4FB765
3 changed files with 150 additions and 147 deletions

View File

@ -6,12 +6,12 @@
id="Layer_1"
x="0px"
y="0px"
width="210mm"
height="297mm"
viewBox="0 0 793.7008 1122.5197"
width="211.93124mm"
height="111.38958mm"
viewBox="0 0 800.99999 420.99999"
enable-background="new 0 0 576 360"
xml:space="preserve"
sodipodi:docname="Evolution_fate_duplicate_genes_-_vector.svg"
sodipodi:docname="Evolution_fate_duplicate_genes.svg"
inkscape:version="1.2.2 (b0a8486541, 2022-12-01)"
inkscape:export-filename="Evolution_fate_duplicate_genes.pdf"
inkscape:export-xdpi="96"
@ -152,23 +152,23 @@
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1"
showgrid="true"
inkscape:zoom="1.1319444"
inkscape:cx="468.22086"
inkscape:cy="138.69939"
inkscape:window-width="1920"
inkscape:window-height="1007"
inkscape:window-x="0"
inkscape:zoom="0.5659722"
inkscape:cx="295.0675"
inkscape:cy="190.82209"
inkscape:window-width="1280"
inkscape:window-height="995"
inkscape:window-x="1920"
inkscape:window-y="0"
inkscape:window-maximized="1"
inkscape:current-layer="Layer_1"
inkscape:document-units="mm"><inkscape:grid
type="xygrid"
id="grid2647"
originx="0"
originy="0" /></sodipodi:namedview>
originx="0.49999999"
originy="0.49999998" /></sodipodi:namedview>
<g
id="g67"
transform="translate(106.477,32.225)">
transform="translate(106.977,32.725)">
<linearGradient
id="SVGID_1_"
gradientUnits="userSpaceOnUse"
@ -280,7 +280,7 @@
<g
id="g34">
<polygon
points="243.4,18.441 232.954,22.71 235.433,18.441 232.954,14.174 "
points="235.433,18.441 232.954,14.174 243.4,18.441 232.954,22.71 "
id="polygon32" />
</g>
</g>
@ -605,7 +605,7 @@
</g>
<g
id="g134"
transform="translate(106.477,32.225)">
transform="translate(106.977,32.725)">
<linearGradient
id="SVGID_5_"
gradientUnits="userSpaceOnUse"
@ -717,7 +717,7 @@
<g
id="g101">
<polygon
points="243.4,109.107 232.954,113.377 235.433,109.107 232.954,104.84 "
points="235.433,109.107 232.954,104.84 243.4,109.107 232.954,113.377 "
id="polygon99" />
</g>
</g>
@ -1041,8 +1041,8 @@
</g>
</g>
<rect
x="488.31104"
y="44.083"
x="488.81104"
y="44.583"
fill="none"
width="191"
height="31.433001"
@ -1050,23 +1050,23 @@
<text
id="text142"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
x="488.31104"
y="54.723"><tspan
x="488.31104"
y="54.723"
x="488.81104"
y="55.223"><tspan
x="488.81104"
y="55.223"
font-family="'Verdana'"
font-size="14"
id="tspan138"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal">Gene with four</tspan><tspan
x="488.31104"
y="71.523003"
x="488.81104"
y="72.023003"
font-family="'Verdana'"
font-size="14"
id="tspan140"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal">different functions</tspan></text>
<g
id="g152"
transform="translate(106.477,32.225)">
transform="translate(106.977,32.725)">
<g
id="g150">
@ -1084,14 +1084,14 @@
<g
id="g148">
<polygon
points="287.833,87.775 282.142,73.847 287.833,77.152 293.523,73.847 "
points="287.833,77.152 293.523,73.847 287.833,87.775 282.142,73.847 "
id="polygon146" />
</g>
</g>
</g>
<g
id="g219"
transform="translate(106.477,32.225)">
transform="translate(106.977,32.725)">
<linearGradient
id="SVGID_9_"
gradientUnits="userSpaceOnUse"
@ -1203,7 +1203,7 @@
<g
id="g186">
<polygon
points="243.4,157.411 232.954,161.681 235.433,157.411 232.954,153.144 "
points="235.433,157.411 232.954,153.144 243.4,157.411 232.954,161.681 "
id="polygon184" />
</g>
</g>
@ -1533,8 +1533,8 @@
<rect
x="410.82098"
y="97.083"
x="411.32098"
y="97.583"
fill="none"
width="196.65601"
height="15.716"
@ -1544,12 +1544,12 @@
font-size="14"
id="text545"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
x="410.8208"
y="107.723">Duplication</text>
x="411.3208"
y="108.223">Duplication</text>
<rect
x="503.06"
y="214.90001"
x="503.56"
y="215.40001"
fill="none"
width="169.136"
height="15.717"
@ -1559,14 +1559,14 @@
font-size="14"
id="text559"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
x="503.05902"
y="225.53951">Divergence</text>
x="503.55902"
y="226.03951">Divergence</text>
<path
fill="none"
stroke="#000000"
stroke-width="2"
stroke-miterlimit="10"
d="m 394.31,225.725 c 0,4.053 6.038,7.333 13.5,7.333 h 255.803 c 7.463,0 13.5,3.28 13.5,7.333"
d="m 394.81,226.225 c 0,4.053 6.038,7.333 13.5,7.333 h 255.803 c 7.463,0 13.5,3.28 13.5,7.333"
id="path561"
sodipodi:nodetypes="cssc" />
<path
@ -1574,12 +1574,12 @@
stroke="#000000"
stroke-width="2"
stroke-miterlimit="10"
d="m 149.81,240.391 c 0,-4.053 6.038,-7.333 13.5,-7.333 h 217.5 c 7.463,0 13.5,-3.28 13.5,-7.333"
d="m 150.31,240.891 c 0,-4.053 6.038,-7.333 13.5,-7.333 h 217.5 c 7.463,0 13.5,-3.28 13.5,-7.333"
id="path563"
sodipodi:nodetypes="cssc" />
<g
id="g573"
transform="translate(192.477,32.225)">
transform="translate(192.977,32.725)">
<g
id="g571">
@ -1597,14 +1597,14 @@
<g
id="g569">
<polygon
points="484.637,216.151 490.327,212.847 484.637,226.775 478.945,212.847 "
points="484.637,226.775 478.945,212.847 484.637,216.151 490.327,212.847 "
id="polygon567" />
</g>
</g>
</g>
<g
id="g583"
transform="translate(58.477,32.225)">
transform="translate(58.977,32.725)">
<g
id="g581">
@ -1622,14 +1622,14 @@
<g
id="g579">
<polygon
points="91.333,216.151 97.023,212.847 91.333,226.775 85.641,212.847 "
points="91.333,226.775 85.641,212.847 91.333,216.151 97.023,212.847 "
id="polygon577" />
</g>
</g>
</g>
<rect
x="107.47701"
y="370.89999"
x="107.97701"
y="371.39999"
fill="none"
width="179.33299"
height="15.717"
@ -1639,11 +1639,11 @@
font-size="14"
id="text587"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
x="243.80266"
y="386.63199">Subfunctionalization</text>
x="244.30266"
y="387.13199">Subfunctionalization</text>
<rect
x="304.811"
y="370.89999"
x="305.311"
y="371.39999"
fill="none"
width="179.334"
height="15.717"
@ -1653,11 +1653,11 @@
font-size="14"
id="text591"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
x="417.17331"
y="386.65601">Neofunctionalization</text>
x="417.67331"
y="387.15601">Neofunctionalization</text>
<rect
x="498.14301"
y="370.89999"
x="498.64301"
y="371.39999"
fill="none"
width="183.334"
height="15.717"
@ -1667,11 +1667,11 @@
font-size="14"
id="text595"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
x="592"
y="385">Degeneration/Gene loss</text>
x="592.5"
y="385.5">Degeneration/Gene loss</text>
<g
id="g3995"
transform="translate(112.477,32.225)"><g
transform="translate(112.977,32.725)"><g
id="g2911"
transform="translate(82.74739)"><g
id="g286">
@ -1786,7 +1786,7 @@
<g
id="g253">
<polygon
points="243.4,246.546 232.954,250.815 235.433,246.546 232.954,242.278 "
points="235.433,246.546 232.954,242.278 243.4,246.546 232.954,250.815 "
id="polygon251" />
</g>
</g>
@ -2173,7 +2173,7 @@
<g
id="g306">
<polygon
points="243.4,299.715 232.954,303.984 235.433,299.715 232.954,295.447 "
points="235.433,299.715 232.954,295.447 243.4,299.715 232.954,303.984 "
id="polygon304" />
</g>
</g>
@ -2562,7 +2562,7 @@
<g
id="g359">
<polygon
points="38.433,246.546 35.954,242.278 46.4,246.546 35.954,250.815 "
points="46.4,246.546 35.954,250.815 38.433,246.546 35.954,242.278 "
id="polygon357" />
</g>
</g>
@ -2949,7 +2949,7 @@
<g
id="g412">
<polygon
points="38.433,299.715 35.954,295.447 46.4,299.715 35.954,303.984 "
points="46.4,299.715 35.954,303.984 38.433,299.715 35.954,295.447 "
id="polygon410" />
</g>
</g>
@ -3386,7 +3386,7 @@
<g
id="g479">
<polygon
points="429.91,242.278 440.357,246.546 429.91,250.815 432.39,246.546 "
points="429.91,250.815 432.39,246.546 429.91,242.278 440.357,246.546 "
id="polygon477" />
</g>
</g>
@ -4143,7 +4143,7 @@
<g
id="g101-5">
<polygon
points="232.954,113.377 235.433,109.107 232.954,104.84 243.4,109.107 "
points="232.954,104.84 243.4,109.107 232.954,113.377 235.433,109.107 "
id="polygon99-6" />
</g>
</g>
@ -4579,7 +4579,7 @@
<g
id="g186-4">
<polygon
points="232.954,161.681 235.433,157.411 232.954,153.144 243.4,157.411 "
points="232.954,153.144 243.4,157.411 232.954,161.681 235.433,157.411 "
id="polygon184-4" />
</g>
</g>
@ -4916,10 +4916,10 @@
font-size="14"
id="text4095"
style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:16px;font-family:'TeX Gyre Termes';-inkscape-font-specification:'TeX Gyre Termes, Normal';font-variant-ligatures:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-east-asian:normal"
x="49.471992"
y="385">Functional redoundancy</text><g
x="49.971992"
y="385.5">Functional redundancy</text><g
id="g4702"
transform="translate(-18.471606)"><g
transform="translate(-17.971606,0.5)"><g
id="g555"
transform="matrix(1,0,0,0.7957841,29.4895,79.036561)">
<g
@ -4939,7 +4939,7 @@
<g
id="g551">
<polygon
points="293.523,212.847 287.833,226.775 282.142,212.847 287.833,216.152 "
points="282.142,212.847 287.833,216.152 293.523,212.847 287.833,226.775 "
id="polygon549" />
</g>
</g>
@ -4963,7 +4963,7 @@
<g
id="g4686">
<polygon
points="287.833,226.775 282.142,212.847 287.833,216.152 293.523,212.847 "
points="287.833,216.152 293.523,212.847 287.833,226.775 282.142,212.847 "
id="polygon4684" />
</g>
</g>

Before

Width:  |  Height:  |  Size: 201 KiB

After

Width:  |  Height:  |  Size: 201 KiB

View File

@ -27,7 +27,7 @@
| singleton | singleton | A gene with a single copy |
| polyploidisation | polyploidisation | Mechanism leading to the acquisition of at least three versions of the same original genome in a species |
| pseudogene | pseudogene | A gene-like sequence that lost its capacity to transcribe |
| segment_duplcation | segment duplication | Long stretches of DNA sequences with high identity score |
| segment_duplication | segment duplication | Long stretches of DNA sequences with high identity score |
| retroduplication | retroduplication | Duplication of a gene through retro-transcription of its RNA transcript |
| autopolyploidisation | autopolyploidisation | Polyploidisation within the same species |
| allopolyploidisation | allopolyploidisation | Polyploidisation with genetic material coming from a diverged species |
@ -66,29 +66,26 @@
#+end_export
* Scientific context
\lettrine{D}uplicate genes represent an important fraction of Eukaryotic genes: It is estimated that between 46% and 65.5% of human genes could be considered as duplicate[fn:: The estimate vary strongly depending on the criteria in use, because ancient duplication event may be hard to detect.] [cite:@correaTransposableElementEnvironment2021].
[[latex:lettrine][D]]uplicate genes represent an important fraction of Eukaryotic genes: It is estimated that between 46% and 65.5% of human genes could be considered as duplicate[fn:: The estimates vary strongly depending on the criteria in use, because ancient duplication events may be hard to detect.] [cite:@correaTransposableElementEnvironment2021].
Duplicate genes offer a pool of genetic material available for further experimentation during species evolution.
** Gene duplication mechanisms
#+begin_src emacs-lisp :exports results :results value raw
(setq fig:gene-duplication-mechanisms "
#+label: fig:gene-duplication-mechanisms
#+caption[Different types of duplication] Different types of duplication. (A) Whole genome duplication. (B) An unequal crossing-over leads to a duplication of a fragment of a chromosome. (C) In tandem duplication, two (set of) genes are duplicated one after the other. (D) Retrotransposon enables retroduplication: a RNA transcript is reverse transcribed and inserted back without introns and with a polyA tail in the genome. (E) A DNA transposon can acquire a fragment of a gene. (F) Segmental duplication corresponds to long stretches of duplicated sequences with high identity. Adapted from [cite:@lallemandOverviewDuplicatedGene2020] (fig. 1)
#+begin_src emacs-lisp :exports results :results raw
(setq fig:gene-duplication-mechanisms "#+label: fig:gene-duplication-mechanisms
#+caption[Different types of duplication]: Different types of duplication. (A) Whole genome duplication. (B) An unequal crossing-over leads to a duplication of a fragment of a chromosome. (C) In tandem duplication, two (set of) genes are duplicated one after the other. (D) Retrotransposon enables retroduplication: a RNA transcript is reverse transcribed and inserted back without introns and with a polyA tail in the genome. (E) A DNA transposon can acquire a fragment of a gene. (F) Segmental duplication corresponds to long stretches of duplicated sequences with high identity. Adapted from [cite:@lallemandOverviewDuplicatedGene2020] (fig. 1)
[[./figures/lallemand2020-fig1_copy.svg]]")
(if (eq org-export-current-backend 'html)
fig:gene-duplication-mechanisms
"")
""
)
#+end_src
#+RESULTS: fig:gene-duplication-mechanisms
#+begin_export latex
\fladdfig{
\includegraphics[width=.9\linewidth]{./figures/lallemand2020-fig1_copy.pdf}
\caption[Different types of duplication]{\label{fig:gene-duplication-mechanisms}Different types of duplication. (A) Whole genome duplication. (B) An unequal crossing-over leads to a duplication of a fragment of a chromosome. (C) In tandem duplication, two (set of) genes are duplicated one after the other. (D) Retrotransposon enables retroduplication: a RNA transcript is reverse transcribed and inserted back without introns and with a polyA tail in the genome. (E) A DNA transposon can acquire a fragment of a gene. (F) Segmental duplication corresponds to long stretches of duplicated sequences with high identity. Adapted from \autocite{lallemandOverviewDuplicatedGene2020} (fig. 1).}
\caption[Different types of duplication]{\label{fig:gene-duplication-mechanisms}Different types of duplication. (A) Whole genome duplication. (B) An unequal crossing-over leads to a duplication of a fragment of a chromosome. (C) In tandem duplication, two (set of) genes are duplicated one after the other. (D) Retrotransposon enables retroduplication: a RNA transcript is reverse transcribed and inserted back without introns and with a polyA tail in the genome. (E) A DNA transposon can acquire a fragment of a gene. (F) Segmental duplication corresponds to long stretches of duplicated sequences with high identity. (Adapted from \textcite{lallemandOverviewDuplicatedGene2020} (fig. 1)).}
}
#+end_export
@ -123,12 +120,23 @@ A typical DNA transposon contains a transposase gene. This enzyme recognizes two
Finally, glspl:segment_duplication, also called /low copy repeats/, are long stretches of DNA with high identity score ([[cref:fig:gene-duplication-mechanisms]] (F)). Their exact duplication mechanism remains unclear [cite:@lallemandOverviewDuplicatedGene2020]. They may come from an accidental replication, distinct from an uneven cross-over or a double stranded breakage.
Transposable elements may well be involved in the mechanism, as a high enrichment of transposable elements is found next to duplicate segment extremities, in /Drosophila/ [cite:@lallemandOverviewDuplicatedGene2020].
#+begin_src emacs-lisp :exports results :results raw
(setq fig:duplicate-genes-fate "#+label: fig:duplicate-genes-fate
,#+caption[Fate of duplicate genes]: Fate of duplicate genes. An original gene with four functions is duplicated. Its two copies may both keep the original functions (functional redoundancy). The original functions may split between the different copies (subfunctionalization). One of the copy may acquire a new function (neofunctionalization). It may also degenerate and lose its original functions (pseudogenization). Adapted from [[https://commons.wikimedia.org/wiki/File:Evolution_fate_duplicate_genes_-_vector.svg][Smedlib]], [[https://creativecommons.org/licenses/by-sa/4.0][CC BY-SA 4.0]] via Wikimedia Commons.
[[./figures/Evolution_fate_duplicate_genes.svg]]")
(if (eq org-export-current-backend 'html)
fig:duplicate-genes-fate
""
)
#+end_src
#+RESULTS:
#+begin_export latex
\fladdfig{
\includegraphics[width=.9\linewidth]{figures/Evolution_fate_duplicate_genes.pdf}
\caption[Fate of duplicate genes]{\label{fig:fate-duplicate-genes} Fate of duplicate genes. An original gene with four functions is duplicated. Its two copies may both keep the original functions (functional redoundancy). The original functions may split between the different copies (subfunctionalization). One of the copy may acquire a new function (neofunctionalization). It may also degenerate and lose its original functions (pseudogenization).
Adapted from \href{https://commons.wikimedia.org/wiki/File:Evolution_fate_duplicate_genes_-_vector.svg}{Smedlib}, \href{https://creativecommons.org/licenses/by-sa/4.0}{CC BY-SA 4.0}, via Wikimedia Commons}
\caption[Fate of duplicate genes]{\label{fig:fate-duplicate-genes} Fate of duplicate genes. An original gene with four functions is duplicated. Its two copies may both keep the original functions (functional redundancy). The original functions may split between the different copies (subfunctionalization). One of the copies may acquire a new function (neofunctionalization). It may also degenerate and lose its original functions (pseudogenization). (Adapted from \href{https://commons.wikimedia.org/wiki/File:Evolution_fate_duplicate_genes_-_vector.svg}{Smedlib}, \href{https://creativecommons.org/licenses/by-sa/4.0}{CC BY-SA 4.0}, via Wikimedia Commons).}
}
#+end_export
@ -149,7 +157,38 @@ Two duplicate genes with the same original function may encounter a gls:subfunct
# *** Functional redundancy
Another possibility is that the two gene copies keep the ancestral function, resulting in functional redundancy. In this case the quantity of gene product may increase.
** Methods to identify duplicate genes
* Objectives for the internship
** Scientific questions
The underlying question of FTAG Finder is the study of the evolutionary fate of duplicate genes in Eukaryotes.
Duplicate genes are
** Extend the existing FTAG Finder Galaxy pipeline
Galaxy is a web-based platform for running accessible data analysis pipelines, first designed for use in genomics data analysis [cite:@goecksGalaxyComprehensiveApproach2010].
Last year, Séanna [[latex:textsc][Charles]] worked on the Galaxy version of the FTAG Finder pipeline during her M1 internship [cite:@charlesFinalisationPipelineFTAG2023]. I will continue this work.
FTAG Finder is currently deployed on the server of the /Laboratoire de Mathématiques et Modélisation d'Évry/[fn:: [[http://stat.genopole.cnrs.fr/galaxy]] ].
** Port FTAG Finder pipeline on a workflow manager
Another objective of my internship will be to port FTAG Finder on a workflow manager better suited to larger and more reproducible analyses.
We will have to make a choice for the tool we will use.
The two main options are Snakemake and Nextflow. Snakemake is a Python-powered workflow manager based on rules /à la/ GNU Make [cite:@kosterSnakemakeScalableBioinformatics2012]. Nextflow is a Groovy-powered workflow manager, which relies on the data flows paradigm [cite:@ditommasoNextflowEnablesReproducible2017]. Both are widely used in the bioinformatics community. Their use has been on the rise since they came out in 2012 and 2013 respectively [cite:@djaffardjyDevelopingReusingBioinformatics2023].
# #+begin_export latex
# \fladdtab{
# \begin{tabular}{ccc}
# \toprule
# & List ref & List $L$ \\
# \midrule
# related to $go$ & $a$ & $b$ \\
# unrelated & $c$ & $d$ \\
# \bottomrule
# \end{tabular}
# \caption{\label{tab:fisher-test-contigency-table}Contingency table for a Fisher exact test on gene lists}
# }
# #+end_export
* Methodological approaches
** Duplicate gene detection method used in FTAG Finder
#+begin_export latex
\fladdfig{
\includegraphics[width=.9\linewidth]{./figures/tag-definition.pdf}
@ -162,81 +201,49 @@ Different methods exists to detect duplicate genes. These methods depend on the
Paralogs are homologous genes derived from a duplication event. We can identify them as homologous genes coming from the same genome, or as homologous genes between different species once we filtered out gls:orthologues (homologous genes derived from a speciation event).
We can use two gene characteristics to assess the homology between two genes: gene structure or sequence similarity.
The sequence similarity can be tested with a sequence alignment tool, such as =BLAST= [cite:@altschulBasicLocalAlignment1990], =Psi-BLAST=, and =HMMER3= [cite:@johnsonHiddenMarkovModel2010], or =diamond= [cite:@buchfinkSensitiveProteinAlignments2021], which are heuristic algorithms, which means they may not provide the best results, but do so way faster than exact algorithms, such as the classical Smith and Waterman algorithm [cite:@smithIdentificationCommonMolecular1981] or its optimized versions =PARALIGN= [cite:@rognesParAlignParallelSequence2001] or =SWIMM=.
This is the case for Triticum aestivum hybridisation, which consisted in the union of the
chromosome set of a Triticum species with those of an Aegilops species
*** FTAG Finder
Developed in the LaMME laboratory, the FTAG Finder (Families and Tandemly Arrayed Genes Finder) pipeline is a simple pipeline targeting the detection of gls:TAG from the proteome of single species [cite:@bouillonFTAGFinderOutil2016].
The sequence similarity can be tested with a sequence alignment tool, such as =BLAST= [cite:@altschulBasicLocalAlignment1990], =Psi-BLAST=, and =HMMER3= [cite:@johnsonHiddenMarkovModel2010], or =diamond= [cite:@buchfinkSensitiveProteinAlignments2021]. These tools are heuristic algorithms, which means they may not provide the best results, but do so way faster than exact algorithms, such as the classical Smith and Waterman algorithm [cite:@smithIdentificationCommonMolecular1981] or its optimized versions =PARALIGN= [cite:@rognesParAlignParallelSequence2001] or =SWIMM=.
The pipeline proceeds in three steps. First, it estimates the homology links between each pair of genes. Then, it deduces the gene families. Finally, it searches for gls:TAG.
*** FTAG Finder
Developed in the LaMME laboratory, the FTAG Finder (Families and Tandemly Arrayed Genes Finder) pipeline is a simple pipeline targeting the detection of gls:TAG based on the sequence of the proteome of single species [cite:@bouillonFTAGFinderOutil2016].
The pipeline proceeds in three steps. First, it estimates the homology links between each pair of genes. Then, it deduces the gene families. Finally, it searches for gls:TAG, relying on the position of genes belonging to the same family.
**** Estimation of homology links between genes
This step consists in establishing a homology relationship between each pair of genes in the proteome.
In this step, the typical tool involved is =BLAST= (Basic Local Alignment Search Tool) [cite:@altschulBasicLocalAlignment1990] run "all against all" on the proteome.
In this step, FTAG Finder uses =BLAST= (Basic Local Alignment Search Tool) [cite:@altschulBasicLocalAlignment1990] with an "all against all" search on the proteome.
Several =BLAST= metrics can be used as an homology measure, such as bitscore, identity percentage, E-value or variations of these. The choice of metrics can affect the results of graph clustering in the following step, and we should therefore chose them carefully [cite:@gibbonsEvaluationBLASTbasedEdgeweighting2015].
Several =BLAST= metrics can be used as a homology measure, such as bitscore, identity percentage, E-value or a variation on these. The choice of metrics can affect the results of graph clustering in the following step, and we should therefore choose them carefully [cite:@gibbonsEvaluationBLASTbasedEdgeweighting2015].
**** Identification of gene families
Based on the homology links between each pair of genes, we construct an undirected weighted graph whose vertices correspond to genes and edges to homology links between them.
We apply a graph clustering algorithm on the graph in order to infer the gene families corresponding to densely connected communities of vertices.
FTAG Finder proposes three clustering algorithm alternatives: single linkage, Markov Clustering [cite:@vandongenNewClusterAlgorithm1998] or Walktrap [cite:@ponsComputingCommunitiesLarge2005].
Based on the homology links between each pair of genes, we construct an undirected weighted graph whose vertices correspond to genes and whose edges correspond to homology links between them.
We apply a graph clustering algorithm on the homology gene graph in order to infer the gene families corresponding to densely connected communities of vertices.
FTAG Finder proposes three graph clustering algorithm alternatives: single linkage, Markov Clustering [cite:@vandongenNewClusterAlgorithm1998] or Walktrap [cite:@ponsComputingCommunitiesLarge2005].
**** Detection of TAGs
**** Detection of TAG
The final step of FTAG Finder consists in the identification of gls:TAG from the gene families and the positions of genes.
For a given chromosome, the tool seeks genes belonging to the same family and located close to each other. The tool allows a maximal number of genes between the homologous genes, with a parameter set by the user. Cref:fig:tag-definitions is a schematic representation of some possible gls:TAG positioning on a genome associated with their definition in FTAG Finder /Find Tags/ step.
For a given chromosome, the tool seeks genes belonging to the same family and located close to each other. The tool allows a maximal number of genes between the homologous genes, with a parameter set by the user. Cref:fig:tag-definitions is a schematic representation of some possible gls:TAG positioning on a genome associated with their definition in this FTAG Finder step.
* Objectives for the internship
** Scientific questions
The underlying question of FTAG Finder is the study of the evolutionary fate of duplicate genes in Eukaryotes.
Duplicate genes are
** Extend the existing FTAG Finder Galaxy pipeline
Galaxy is a web-based platform for running accessible data analysis pipelines, first designed for use in genomics data analysis [cite:@goecksGalaxyComprehensiveApproach2010].
Last year, Séanna [[latex:textsc][Charles]] worked on the Galaxy version of the FTAG Finder pipeline during her M1 internship [cite:@charlesFinalisationPipelineFTAG2023]. I will continue this work.
FTAG Finder is currently deployed on the server of the /Laboratoire d'Analyse et Modélisation d'Évry/[fn: [[http://stat.genopole.cnrs.fr/galaxy]] ].
** Analyses performed on TAG
** Port FTAG Finder pipeline on a workflow manager
Another objective of my internship will be to port FTAG Finder on a workflow manager better suited to larger and more reproducible analysis.
FTAG Finder output consists mostly of lists of genes, corresponding to TAG of various definitions. These lists can subsequently be used as the basis of more specific statistical analyses.
We will have to make a choice for the tool we will use.
The two main options being Snakemake and Nextflow. Snakemake is a python powered workflow manager based on rules /à la/ GNU Make [cite:@kosterSnakemakeScalableBioinformatics2012]. Nextflow is a groovy powered workflow manager, which rely on the data flows paradigm [cite:@ditommasoNextflowEnablesReproducible2017]. Both are widely used in the bioinformatics community. Their use have been on the rise since they came out in 2012 and 2013 respectively [cite:@djaffardjyDevelopingReusingBioinformatics2023].
*** Are there over-represented gene functions among TAG?
#+begin_export latex
\fladdtab{
\begin{tabular}{ccc}
\toprule
& List ref & List $L$ \\
\midrule
related to $go$ & $a$ & $b$ \\
unrelated & $c$ & $d$ \\
\bottomrule
\end{tabular}
\caption{\label{tab:fisher-test-contigency-table}Contingency table for a Fisher exact test on gene lists}
}
#+end_export
* Methodological approaches
The gls:GO describes biological concepts across three main classes: Cellular Component, Molecular Function and Biological Process. It describes a controlled vocabulary of concepts and the relationships between them. We can link genes that have a functional annotation to particular GO terms. We can then perform a GO enrichment analysis to assess whether a particular GO term is over-represented in a particular gene list, compared to another. To do so, we can use a Fisher exact test, using the FDR (False Discovery Rate) control procedure of [[latex:textsc][Benjamini]] and [[latex:textsc][Hochberg]].
Based on the output of the FTAG Finder pipeline, which consist in lists of genes, researchers could perform bespoke subsequent analyses on TAGs.
# Let $go$ be a GO term. We construct a contingency matrix based on the count of genes associated with this GO term (or associated with one of its brother GO term) for the reference gene list and the list of interest (here, the list of genes in a TAG) (see cref:tab:fisher-test-contigency-table).
*** Are TAG located preferentially on specific chromosome region?
** Analysis of over-represented gene functions among TAGs
*** Are there chromosomes enriched or depleted in TAG?
The gls:GO describes biological concepts across three main classes: Cellular Component, Molecular Function and Biological Process. It describe a controlled vocabulary of concepts and the relationship between them. The genes with known functions can be associated with a particular GO term. We can perform an GO enrichment analysis to assess whether a particular GO term is over-represented in a particular gene list, compared to an other. We can use a Fisher exact test, using the FDR (False Discovery Rate) control procedure of [[latex:textsc][Benjamini]] and [[latex:textsc][Hocheberg]] to do so.
Let $go$ be a GO term. We construct a contingency matrix based on the count of genes associated with this GO term (or associated with one of its brother GO term) for the reference gene list and the list of interest (here, the list of genes in a TAG) (see cref:tab:fisher-test-contigency-table).
** Are TAG located preferentially on specific chromosome region?
** Are there chromosomes enriched or depleted in TAG?
** Do genes located next to each other in a TAG share the same orientation?
*** Do genes located next to each other in a TAG share the same orientation?
The concordance of two genes of a TAG falls in three possible cases: either both genes are on the same strand (\(\rightarrow \rightarrow\)), or they have a divergent orientation (\(\leftarrow \rightarrow\)), or a convergent one (\(\rightarrow \leftarrow\)). Graham conjectured that genes of a TAG that are close to each other would be more likely to share the same orientation, and this indeed seems to be the case [cite:@shojaRoadmapTandemlyArrayed2006].
# To test this, we can use a $\chi^2$ test of goodness of fit or a Student $t$-test.
*** TODO write down the hypotheses
*** What is the robustness and accuracy of the detection method?
** What is the robustness and accuracy of the detection method?
[cite/t:@le-hoangEtudeTranscriptomiqueGenes2017] started analyses of the impact of parameter choice on FTAG Finder output lists. A more detailed benchmark of FTAG Finder in comparison with other methods on some known test dataset might be of particular interest.
[cite/t:@le-hoangEtudeTranscriptomiqueGenes2017] started analyzing the impact of parameter choice on FTAG Finder results. A more detailed benchmark of FTAG Finder in comparison with other methods on some controlled test dataset might be of particular interest.
This would pose the challenge of homogenization of the outputs of the different methods.
#+begin_export latex
\flstop
@ -247,9 +254,7 @@ The concordance of two genes of a TAG falls in three possible cases: either both
:UNNUMBERED: t
:END:
#+begin_export latex
\printbibliography[heading=none]
#+end_export
#+print_bibliography:
#+begin_export latex
\cleartoleftpage
@ -299,10 +304,8 @@ Principle: construct vertex communities based on where an agent would get stuck
: Loaded ./setup.el
#+begin_example
# LocalWords: speciation subfunctionalization neofunctionalization
# LocalWords: pseudogenization bioinformatics
# Local Variables:
# eval: (progn (org-babel-goto-named-src-block "startup") (org-babel-execute-src-block) (outline-hide-sublevels 1))
# End:
#+end_example
# # Local Variables:
# # eval: (progn (org-babel-goto-named-src-block "startup") (org-babel-execute-src-block) (outline-hide-sublevels 1))
# # End:

BIN
report.pdf (Stored with Git LFS)

Binary file not shown.