diff --git a/notebook.org b/notebook.org
new file mode 100644
index 0000000..d7fd400
--- /dev/null
+++ b/notebook.org
@@ -0,0 +1,1009 @@
+#+title: Comparative Genomics Project
+#+subtitle: Duplicate Genes in /Glycine max/
+#+date: 2024-2025
+#+author: Samuel Ortion
+#+LATEX_CLASS: scrartcl
+#+LATEX_HEADER: \titlehead{M2 GENIOMHE}
+
+* General infos
+
+<2024-10-25 Fri>
+
+** Objectives
+*** Part 1
+1. Determine the amount of duplicate genes in my genome.
+   - What is the proportion of duplicate genes in my genome.
+   - Give an interval based on two datasets (low stringency dataset and high stringency dataset).
+     #+caption: High stringency dataset and low stringency dataset duplicate genes filtering criteria.
+     #+name: dataset-criteria
+     |            | Low                           | High                          |
+     |------------+-------------------------------+-------------------------------|
+     | Coverage   | 30%                           | 40%                           |
+     | %identity  | 30%                           | 50%                           |
+     | Bitscore   | no filtering                  | no filtering                  |
+     | Clustering | mcl (with default parameters) | mcl (with default parameters) |
+
+     #+name: low-stringency
+     #+begin_src bash
+min_coverage=30
+min_identity=30
+     #+end_src
+
+     #+name: high-stringency
+     #+begin_src bash
+min_coverage=40
+min_identity=50
+     #+end_src
+2. Determine the amount of Tandemly Arrayed Genes (TAGs) in the genome (definition 0 and 1).
+3. Determine the amount in plant's chloroplasts and mitochondria.
+4. Is there a lot of duplicated operons in plants?
+
+*** Part 2
+1. Compute \(K_s\) values and use it as a proxy of the age of the duplication.
+2. Are TAGs younger than other duplicate genes?
+
+
+** Our species
+
+/Glycine max/ is the soy plant.
+
+* Download the source data
+
+https://plants.ensembl.org/Glycine_max/Info/Index
+
+https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
+
+
+* Develop the Nextflow workflow
+
+** Filtering the proteome to keep the longest isoform
+
+I used three awk families_and_TAGs/scripts to filter the FASTA file to keep only the longest isoforms:
+ 1. =protein_lengths.awk= associate each FASTA record with the length of its sequence,
+ 2. =filter_longest.awk= keep the isoform ID that has the maximum length,
+ 3. =filter_records.awk= based on the list of isoform ID, retrieve the sequence of from the original file.
+
+=protein_lengths.awk=
+
+#+include: workflow/families_and_TAGs/scripts/protein_lengths.awk src awk
+
+=filter_longest.awk=
+
+#+include: workflow/families_and_TAGs/scripts/filter_longest.awk src awk
+
+=filter_records.awk=
+
+#+include: workflow/families_and_TAGs/scripts/filter_records_fasta.awk src awk
+
+How many protein coding gene are there?
+#+name: protein-coding-gene-count
+#+begin_src bash :export both
+zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
+    | grep -v 'supercontig' \
+    | awk ' /^>/ { print $4 }' | sort | uniq | wc -l
+#+end_src
+
+#+RESULTS: protein-coding-gene-count
+: 55589
+
+#+RESULTS:
+: 55897
+
+** Filtering out Chloroplasts and Mitochondria proteins
+
+From the Ensembl Plants index page we can see that there is no chloroplast nor mitochondria proteins in the sequenced genome of /Glycine max/.
+
+** Filter out supercontigs
+
+<2024-10-28 Mon>
+
+#+include: workflow/families_and_TAGs/scripts/remove_supercontigs.awk src awk
+
+#+begin_src bash
+zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \
+    | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs.awk \
+    > ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa
+#+end_src
+
+#+RESULTS:
+
+/Sanity check:/ Has the script removed all supercontigs proteins and only them?
+
+/How many records in supercontigs were there?/
+#+begin_src bash :export both
+zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz  \
+    | awk -F' ' ' /^>/ { print $3}'  | grep -c "^supercontig"
+#+end_src
+
+#+RESULTS:
+: 435
+
+/How many records have been removed?/
+
+#+name: all-isoforms
+#+begin_src bash
+zgrep -c "^>" ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz
+#+end_src
+
+#+RESULTS: all-isoforms
+: 88412
+
+#+name: all-isoforms-no-supercontig
+#+begin_src bash
+grep -c "^>" ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa
+#+end_src
+
+#+RESULTS: all-isoforms-no-supercontig
+: 87977
+
+#+begin_src bash :var ALL_ISOFORMS=all-isoforms :var NO_SUPERCONTIGS=all-isoforms-no-supercontig :exports both
+expr $ALL_ISOFORMS - $NO_SUPERCONTIGS
+#+end_src
+
+#+RESULTS:
+: 435
+
+** BLASTp All Against All
+
+#+begin_src bash
+makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max"
+#+end_src
+
+#+begin_src bash
+blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv"
+#+end_src
+
+** Clustering
+
+Using =mcl=.
+
+1. Extract the homology in ABC format
+   #+begin_src bash
+awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
+   #+end_src
+2. Run =mcl= with default parameters
+   #+begin_src bash
+mcl 'graph.abc' --abc -o 'custering.mcl'
+   #+end_src
+3. Convert =mcl= output to TSV with two columns =gene id=, =family id=.
+   #+include: workflow/families_and_TAGs/scripts/mcl_to_tsv.awk src awk
+
+   #+begin_src bash
+awk -f workflow/families_and_TAGs/scripts/mcl_to_tsv.awk 'clustering.mcl' > 'families.tsv'
+   #+end_src
+
+
+** Running the Nextflow workflow
+
+Update the parameters in =nextflow.config= and run
+#+begin_src bash
+nextflow run main.nf -profile conda -resume
+#+end_src
+
+from the =./workflow/= folder
+
+
+* From the given BLASTp format 6 file
+<2024-11-02 Sat>
+/How many BLASTp HSP have been reported?/
+#+begin_src bash
+cat ../data/Glycine_max_Blastp_longIsoforme | wc -l
+#+end_src
+
+#+RESULTS:
+: 4456730
+
+** Remove HSP on proteins found on supercontigs
+
+Extract the isoform ID whose gene is on supercontigs.
+#+begin_src bash
+zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz| awk -F' ' '/^>/ && $3 ~ /^supercontig/ { sequence_id = $1; gsub(">", "", sequence_id); print sequence_id }' > ./tmp/protein_on_supercontigs.list
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+cat ./tmp/protein_on_supercontigs.list | wc -l
+#+end_src
+
+#+RESULTS:
+: 435
+
+Remove HSP with subject or query being on a supercontig
+
+=filter_blastp_sequence_id.awk=
+
+#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk src awk
+
+#+begin_src bash
+awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig
+#+end_src
+
+#+RESULTS:
+
+How many HSP have been removed?
+
+#+begin_src bash
+length() {
+    cat $1 | wc -l
+ }
+expr $(length ../data/Glycine_max_Blastp_longIsoforme) - $(length ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig)
+#+end_src
+
+#+RESULTS:
+: 36598
+
+#+begin_src bash
+cat ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig | wc -l
+#+end_src
+
+#+RESULTS:
+: 4420132
+
+
+** Filter by coverage and identity percentage
+
+*** Add protein length and gene name columns
+
+**** Extract the protein length and gene id from the FASTA file
+
+
+#+begin_src bash
+zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz | awk -f ./workflow/families_and_TAGs/scripts/protein_lengths.awk > ./tmp/Glycine_max_protein_length_gene.tsv
+#+end_src
+
+#+RESULTS:
+
+
+**** Join the BLASTp HSP file with the gene id and gene lengths
+
+#+begin_src bash
+protein_length=./tmp/Glycine_max_protein_length_gene.tsv
+blastp=./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig
+sort -k 1 "${blastp}" > tmp/blastp_s
+sort -k 1 "${protein_length}" > tmp/protein_length_s
+join -1 1 -2 1 -t $'\t' tmp/blastp_s tmp/protein_length_s > tmp/join1.tsv
+sort -k 2 tmp/join1.tsv > tmp/join1.tsv_s
+join -1 2 -2 1 -t $'\t' tmp/join1.tsv_s tmp/protein_length_s > tmp/joined.blastp.tsv
+#+end_src
+
+#+RESULTS:
+
+**** Filter the BLASTp according to coverage and identity percentage
+
+#+name: filter-blastp
+#+begin_src bash
+blastp=./tmp/joined.blastp.tsv
+awk -f "workflow/families_and_TAGs/scripts/filter_blastp.awk" \
+        -v min_coverage="${min_coverage}" \
+        -v min_identity="${min_identity}" \
+        "${blastp}" > "tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.tsv"
+#+end_src
+
+#+RESULTS: filter-blastp
+
+***** First dataset
+
+#+begin_src bash :noweb strip-export
+<<low-stringency>>
+<<filter-blastp>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash :noweb strip-export
+<<high-stringency>>
+<<filter-blastp>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.tsv | wc -l
+#+end_src
+
+#+RESULTS:
+: 1520850
+
+#+begin_src bash
+cat ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.tsv | wc -l
+#+end_src
+
+#+RESULTS:
+: 342180
+
+** Clustering
+<2024-11-03 Sun>
+*** Extract the homology graph
+
+#+name: extract-abc
+#+begin_src bash
+blastp="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.tsv"
+abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.abc"
+awk 'BEGIN { FS = OFS = "\t" } { print $14, $16, $12 }' "$blastp" > "$abc"
+#+end_src
+
+
+#+begin_src bash :noweb strip-export
+<<low-stringency>>
+<<extract-abc>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash :noweb strip-export
+<<high-stringency>>
+<<extract-abc>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+head -n 1 "tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc"
+#+end_src
+
+#+RESULTS:
+| GLYMA_10G098500 | GLYMA_U008600 | 154 |
+
+*** Filter the homology graph to keep only the heaviest weight
+
+
+#+include: workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk src awk
+
+#+name: filter-abc
+#+begin_src bash
+original_abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.abc"
+filtered_abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}_f.abc"
+awk -f "workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk" "$original_abc" > "$filtered_abc"
+#+end_src
+
+#+begin_src bash :noweb strip-export
+<<low-stringency>>
+<<filter-abc>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash :noweb strip-export
+<<high-stringency>>
+<<filter-abc>>
+#+end_src
+
+#+RESULTS:
+
+*** Run Markov Clustering
+
+#+name: mcl
+#+begin_src bash
+abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}_f.abc"
+mcl="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl"
+mcl "${abc}" --abc --discard-loops=yes -o "$mcl"
+#+end_src
+
+#+RESULTS: mcl
+
+#+begin_src bash :noweb strip-export
+<<low-stringency>>
+<<mcl>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash :noweb strip-export
+<<high-stringency>>
+<<mcl>>
+#+end_src
+
+#+RESULTS:
+
+*** Transform MCL output into a gene to family mapping TSV file
+
+#+name: mcl-to-tsv
+#+begin_src bash
+mcl="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl"
+tsv="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl.tsv"
+awk -f "workflow/families_and_TAGs/scripts/mcl_to_tsv.awk" "${mcl}" > "${tsv}"
+#+end_src
+
+#+begin_src bash :noweb strip-export
+<<low-stringency>>
+<<mcl-to-tsv>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash :noweb strip-export
+<<high-stringency>>
+<<mcl-to-tsv>>
+#+end_src
+
+#+RESULTS:
+
+#+name: duplicate-genes-count-high-stringency
+#+begin_src bash
+tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
+cat "${tsv}" | wc -l
+#+end_src
+
+#+RESULTS: duplicate-genes-count-high-stringency
+: 46769
+
+#+name: duplicate-genes-count-low-stringency
+#+begin_src bash
+tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
+cat "${tsv}" | wc -l
+#+end_src
+
+#+RESULTS: duplicate-genes-count-low-stringency
+: 50254
+
+#+begin_src bash
+tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
+head "${tsv}"
+#+end_src
+
+#+RESULTS:
+| GLYMA_12G034500 | 1 |
+| GLYMA_12G138300 | 1 |
+| GLYMA_12G124600 | 1 |
+| GLYMA_12G126700 | 1 |
+| GLYMA_12G125200 | 1 |
+| GLYMA_12G034900 | 1 |
+| GLYMA_U036400   | 1 |
+| GLYMA_12G124200 | 1 |
+| GLYMA_12G126300 | 1 |
+| GLYMA_12G035100 | 1 |
+
+#+begin_src bash
+tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
+head "${tsv}"
+#+end_src
+
+#+RESULTS:
+| GLYMA_07G053000 | 1 |
+| GLYMA_10G247700 | 1 |
+| GLYMA_12G188000 | 1 |
+| GLYMA_07G159200 | 1 |
+| GLYMA_13G313300 | 1 |
+| GLYMA_15G016600 | 1 |
+| GLYMA_14G159200 | 1 |
+| GLYMA_18G287500 | 1 |
+| GLYMA_20G132900 | 1 |
+| GLYMA_16G204800 | 1 |
+
+What is the interval of duplicate gene count? There are between 48109 and 49391 duplicate genes in /Glycine max/.
+
+What is the number of families?
+
+#+begin_src bash
+tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv"
+cut -f2 "${tsv}" | sort | uniq | wc -l
+#+end_src
+
+#+RESULTS:
+: 8426
+
+#+begin_src bash
+tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv"
+cut -f2 "${tsv}" | sort | uniq | wc -l
+#+end_src
+
+#+RESULTS:
+: 11997
+
+What is the number of singletons?
+
+#+name: singletons-low-stringency
+#+begin_src bash :var ALL=protein-coding-gene-count :var DUPLICATED=duplicate-genes-count-low-stringency
+expr $ALL - $DUPLICATED
+#+end_src
+
+#+RESULTS: singletons-low-stringency
+: 5643
+
+#+name: singletons-high-stringency
+#+begin_src bash :var ALL=protein-coding-gene-count :var DUPLICATED=duplicate-genes-count-high-stringency
+expr $ALL - $DUPLICATED
+#+end_src
+
+#+RESULTS: singletons-high-stringency
+: 9128
+
+* Amount of Tandemly Arrayed Genes
+
+*** Extract gene position from a GFF3 file
+
+#+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk
+
+#+begin_src bash
+zcat "../data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz" \
+    | awk -f "./workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk" \
+    > "./tmp/Glycine_max.gene_positions.tsv"
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash
+head "./tmp/Glycine_max.gene_positions.tsv"
+#+end_src
+
+#+RESULTS:
+| GLYMA_01G000100 | 1 |  27355 |  28320 |
+| GLYMA_01G000200 | 1 |  58975 |  67527 |
+| GLYMA_01G000300 | 1 |  67770 |  69968 |
+| GLYMA_01G000400 | 1 |  90152 |  95947 |
+| GLYMA_01G000500 | 1 |  90289 |  91197 |
+| GLYMA_01G000600 | 1 | 116094 | 127845 |
+| GLYMA_01G000700 | 1 | 143467 | 155573 |
+| GLYMA_01G000800 | 1 | 157030 | 157772 |
+| GLYMA_01G000900 | 1 | 170534 | 193342 |
+| GLYMA_01G001000 | 1 | 196256 | 201895 |
+
+#+name: gene-positions-count
+#+begin_src bash
+cat "./tmp/Glycine_max.gene_positions.tsv" \
+    | wc -l
+#+end_src
+
+#+RESULTS:
+: 55589
+
+There is 55897 protein coding genes in the sequence file, but only 55589 gene positions.
+
+#+begin_src bash :var POSITIONS=gene-positions-count :var GENES=protein-coding-gene-count :exports both
+expr $GENES - $POSITIONS
+#+end_src
+
+#+RESULTS:
+: 308
+
+*** Extract TAGs
+
+#+name: extract-TAGs
+#+begin_src bash
+positions="./tmp/Glycine_max.gene_positions.tsv"
+families="./tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl.tsv"
+./rust/tagfinder/target/release/tagfinder \
+    --families "$families" \
+    --positions "$positions" \
+    --definitions 0,1 \
+    > "./tmp/Glycine_max_TAGs_coverage${min_coverage}_identity${min_identity}.tsv"
+#+end_src
+
+#+RESULTS: extract-TAGs
+
+#+begin_src bash :noweb strip-export
+<<low-stringency>>
+<<extract-TAGs>>
+#+end_src
+
+#+RESULTS:
+
+#+begin_src bash :noweb strip-export
+<<high-stringency>>
+<<extract-TAGs>>
+#+end_src
+
+#+RESULTS:
+
+# LocalWords:  operons
+
+#+begin_src bash
+head "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
+#+end_src
+
+#+RESULTS:
+| gene            | family | tag0 | tag1 |
+| GLYMA_07G257400 |   3109 | -    | -    |
+| GLYMA_17G203900 |   2067 | -    | -    |
+| GLYMA_20G062000 |    509 | -    | -    |
+| GLYMA_16G099400 |     40 | -    | -    |
+| GLYMA_14G045400 |   1136 | 804  | 896  |
+| GLYMA_11G189200 |     15 | -    | -    |
+| GLYMA_02G191500 |   1310 | -    | -    |
+| GLYMA_02G308200 |    520 | -    | -    |
+| GLYMA_14G205200 |      8 | -    | -    |
+
+How many TAGs for definition 0?
+
+#+begin_src bash
+awk 'BEGIN {
+           max=0
+    }
+    NR > 1 {
+        if ($3 > max) {
+            max=$3
+        }
+    }
+    END  {
+        print max
+    }
+    ' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
+#+end_src
+
+#+RESULTS:
+: 3208
+
+For TAG1?
+
+#+begin_src bash
+awk 'BEGIN {
+           max=0
+    }
+    NR > 1 {
+        if ($4 > max) {
+            max=$4
+        }
+    }
+    END  {
+        print max
+    }
+    ' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
+#+end_src
+
+#+RESULTS:
+: 3481
+
+
+#+begin_src bash
+awk 'BEGIN {
+           max=0
+    }
+    NR > 1 {
+        if ($3 > max) {
+            max=$3
+        }
+    }
+    END  {
+        print max
+    }
+    ' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv"
+#+end_src
+
+#+RESULTS:
+: 2500
+
+
+#+begin_src bash
+awk 'BEGIN {
+           max=0
+    }
+    NR > 1 {
+        if ($4 > max) {
+            max=$4
+        }
+    }
+    END  {
+        print max
+    }
+    ' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv"
+#+end_src
+
+#+RESULTS:
+: 2652
+
+
+*** Size of the greatest TAG
+
+**** High stringency
+
+/TAG₀/
+
+#+begin_src bash
+awk '
+    {
+    tag_index=$3
+    if (tag_index != "-")
+        gene_count[tag_index]++
+    }
+    END {
+    max = 0
+    max_tag_index = ""
+    for (tag_index in gene_count) {
+        if (gene_count[tag_index] > max) {
+            max_tag_index = tag_index
+            max = gene_count[tag_index]
+        }
+    }
+    print max_tag_index, max
+    }' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv"
+#+end_src
+
+#+RESULTS:
+: 421 53
+
+/TAG₁/
+
+#+begin_src bash
+awk '
+    {
+    tag_index=$4
+    if (tag_index != "-")
+        gene_count[tag_index]++
+    }
+    END {
+    max = 0
+    max_tag_index = ""
+    for (tag_index in gene_count) {
+        if (gene_count[tag_index] > max) {
+            max_tag_index = tag_index
+            max = gene_count[tag_index]
+        }
+    }
+    print max_tag_index, max
+    }' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv"
+#+end_src
+
+#+RESULTS:
+: 459 76
+
+**** Low stringency
+
+
+/TAG₀/
+
+#+begin_src bash
+awk '
+    {
+    tag_index=$3
+    if (tag_index != "-")
+        gene_count[tag_index]++
+    }
+    END {
+    max = 0
+    max_tag_index = ""
+    for (tag_index in gene_count) {
+        if (gene_count[tag_index] > max) {
+            max_tag_index = tag_index
+            max = gene_count[tag_index]
+        }
+    }
+    print max_tag_index, max
+    }' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
+#+end_src
+
+#+RESULTS:
+: 579 76
+
+/TAG₁/
+
+#+begin_src bash
+awk '
+    {
+    tag_index=$4
+    if (tag_index != "-")
+        gene_count[tag_index]++
+    }
+    END {
+    max = 0
+    max_tag_index = ""
+    for (tag_index in gene_count) {
+        if (gene_count[tag_index] > max) {
+            max_tag_index = tag_index
+            max = gene_count[tag_index]
+        }
+    }
+    print max_tag_index, max
+    }' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv"
+#+end_src
+
+#+RESULTS:
+: 646 76
+
+* DONE Complete slides
+<2024-11-23 Sat>
+
+* Family sizes
+
+#+begin_src bash
+head -1 ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl \
+    | awk '{ print NF }'
+#+end_src
+
+#+RESULTS:
+: 437
+
+#+begin_src bash
+head -1 ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl \
+    | awk '{ print NF }'
+#+end_src
+
+#+RESULTS:
+: 159
+
+
+* TODO $K_s$ For all duplicated pairs within a family
+
+Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$.
+
+PAML package:
+#+begin_src bash
+conda install paml pal2nal clustalw -c bioconda
+#+end_src
+
+** Use a Nextflow workflow
+<2024-12-22 Sun>
+
+*** All pair of duplicate genes
+
+Generated by a rust program in =rust/pairs= folder.
+The algorithm is simple.
+We assume the gene-families mapping file is sorted by family index.
+For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j.
+
+
+Example:
+
+#+begin_src bash
+cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv
+head ./tmp/Glycine_max_duplicate_gene_pairs.tsv
+#+end_src
+
+#+RESULTS:
+| GLYMA_07G053000   | GLYMA_10G247700 |
+| GLYMA_07G053000 | GLYMA_12G188000 |
+| GLYMA_07G053000 | GLYMA_07G159200 |
+| GLYMA_07G053000 | GLYMA_13G313300 |
+| GLYMA_07G053000 | GLYMA_15G016600 |
+| GLYMA_07G053000 | GLYMA_14G159200 |
+| GLYMA_07G053000 | GLYMA_18G287500 |
+| GLYMA_07G053000 | GLYMA_20G132900 |
+| GLYMA_07G053000 | GLYMA_16G204800 |
+| GLYMA_07G053000 | GLYMA_08G302300 |
+
+*** PAML Steps
+Example for the first gene pair.
+
+/Extract the CDS sequence and protein sequence of the genes/
+
+#+name: pair-gene-id
+#+begin_src bash
+gene_id_1="GLYMA_07G053000"
+gene_id_2="GLYMA_10G247700"
+#+end_src
+
+CDS:
+
+
+#+name: pair-gene-cds-extract
+#+begin_src bash :noweb strip-export
+output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
+echo "" > "${output_file}"
+for gene_id in "${gene_id_1}" "${gene_id_2}"; do
+        zcat ../data/Glycine_max.Glycine_max_v2.1.cds.all.fa.gz | awk -v gene_id="${gene_id}" '
+        BEGIN {
+            on_gene=0
+        }
+        /^[^>]/ && on_gene == 1 {
+            print $0
+        }
+        /^>/ {
+            gene = $4
+            gsub("gene:", "", gene)
+            if (gene == gene_id) {
+                on_gene=1
+                print $0
+            } else {
+                on_gene=0
+            }
+        }
+        ' >> "${output_file}"
+done
+#+end_src
+
+#+RESULTS: pair-gene-cds-extract
+
+#+begin_src bash :noweb strip-export
+<<pair-gene-id>>
+<<pair-gene-cds-extract>>
+#+end_src
+
+#+RESULTS:
+
+Proteins:
+
+#+name: pair-gene-protein-extract
+#+begin_src bash :noweb strip-export
+output_file="./tmp/${gene_id_1}_${gene_id_2}.prot.fst"
+echo "" > "${output_file}"
+for gene_id in "${gene_id_1}" "${gene_id_2}"; do
+        cat ./tmp/proteome_filtered.fa | awk -v gene_id="${gene_id}" '
+        BEGIN {
+            on_gene=0
+        }
+        /^[^>]/ && on_gene == 1 {
+            print $0
+        }
+        /^>/ {
+            gene = $4
+            gsub("gene:", "", gene)
+            if (gene == gene_id) {
+                on_gene=1
+                print $0
+            } else {
+                on_gene=0
+            }
+        }
+        ' >> "${output_file}"
+done
+#+end_src
+
+#+RESULTS: pair-gene-protein-extract
+
+
+#+begin_src bash :noweb strip-export
+<<pair-gene-id>>
+<<pair-gene-protein-extract>>
+#+end_src
+
+#+RESULTS:
+
+/Clustalw2 alignment/
+
+#+begin_src bash :noweb strip-export
+<<pair-gene-id>>
+protein_sequence="./tmp/${gene_id_1}_${gene_id_2}.prot.fst"
+clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="./tmp/${gene_id_1}_${gene_id_2}.prot.ali.aln"
+#+end_src
+
+#+RESULTS:
+
+/Convert protein alignment to coding sequence alignment/
+#+begin_src bash :noweb strip-export
+<<pair-gene-id>>
+protein_alignment="./tmp/${gene_id_1}_${gene_id_2}.prot.ali.aln"
+coding_sequence="./tmp/${gene_id_1}_${gene_id_2}.cds.fst"
+pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy"
+#+end_src
+
+
+/YN00 Ka-Ks computation/
+
+#+begin_src bash :noweb strip-export
+<<pair-gene-id>>
+phylip_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy"
+echo "seqfile = ${phylip_file}\noutfile = ${phylip_file}.yn\nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
+yn00
+#+end_src
+
+/Extraction of Ka-Ks values/
+
+#+begin_src bash :noweb strip-export
+<<pair-gene-id>>
+yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn"
+awk '
+    BEGIN {
+          on_good_section=0
+          skip=0
+    }
+    $1 == "(B)" {
+         skip=8
+         on_good_section=1
+    }
+
+    on_good_section == 1 {
+        if (skip == 0) {
+            Ka=$8
+            Ks=$11
+            print Ka, Ks
+            exit
+        } else {
+            skip -= 1
+        }
+    }
+    ' "${yn_file}"
+#+end_src
+
+#+RESULTS:
+: 0.6207 1.9409
+
+*** Run the Nextflow workflow
+<2024-12-23 Mon>
+
+
+* TODO Does /Glycine max/ have big TAGs and which function are the big TAG gene implied?
+
+
+* TODO Are TAG pairs different in age from non-TAG pairs?
+
+* TODO Are genes inside a TAG orientated in the same way more often than random?
diff --git a/workflow/KaKs/conda/clustalw.yaml b/workflow/KaKs/conda/clustalw.yaml
new file mode 100644
index 0000000..05a370b
--- /dev/null
+++ b/workflow/KaKs/conda/clustalw.yaml
@@ -0,0 +1,6 @@
+name: clustalw
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::clustalw
diff --git a/workflow/KaKs/conda/pal2nal.yaml b/workflow/KaKs/conda/pal2nal.yaml
new file mode 100644
index 0000000..4370ce2
--- /dev/null
+++ b/workflow/KaKs/conda/pal2nal.yaml
@@ -0,0 +1,6 @@
+name: pal2nal
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::pal2nal
diff --git a/workflow/KaKs/conda/paml.yaml b/workflow/KaKs/conda/paml.yaml
new file mode 100644
index 0000000..3604566
--- /dev/null
+++ b/workflow/KaKs/conda/paml.yaml
@@ -0,0 +1,6 @@
+name: paml
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::paml
diff --git a/workflow/KaKs/main.nf b/workflow/KaKs/main.nf
new file mode 100644
index 0000000..196d970
--- /dev/null
+++ b/workflow/KaKs/main.nf
@@ -0,0 +1,39 @@
+/** Compute Ka, Ks values for each
+/** pair of duplicate genes in a TAG
+/** and in a gene family
+/** */
+
+include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf"
+include { KA_KS } from "./modules/ka_ks.nf"
+
+process GENE_PAIRS {
+    input:
+    path families
+    output:
+    path 'pairs'
+    script:
+    """
+    cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs'
+    """
+}
+
+workflow {
+    families = params.families
+    cds_gz = params.cds
+    proteome = params.proteome
+
+    cds = GUNZIP_CDS(cds_gz)
+
+    pairs = GENE_PAIRS(families)
+            .splitCsv(sep: '\t', header: false)
+            .map { row -> tuple(row[0], row[1]) }
+
+
+    kaks = KA_KS(pairs, proteome, cds)
+
+    // Save to a CSV with collectFile
+    header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs")
+    header.concat( kaks )
+        .collectFile( name: 'test.txt', newLine: true, sort: false )
+        .view() 
+}
diff --git a/workflow/KaKs/modules/gunzip.nf b/workflow/KaKs/modules/gunzip.nf
new file mode 100644
index 0000000..2c68cf6
--- /dev/null
+++ b/workflow/KaKs/modules/gunzip.nf
@@ -0,0 +1,12 @@
+process GUNZIP {
+    input:
+    path zipped_file
+
+    output:
+    path zipped_file.baseName
+
+    script:
+    """
+    gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
+    """
+}
diff --git a/workflow/KaKs/modules/ka_ks.nf b/workflow/KaKs/modules/ka_ks.nf
new file mode 100644
index 0000000..481b3e8
--- /dev/null
+++ b/workflow/KaKs/modules/ka_ks.nf
@@ -0,0 +1,132 @@
+
+process EXTRACT_TWO_PROTEINS {
+    input:
+    tuple(val(gene_id_1), val(gene_id_2))
+    path fasta_file
+    output:
+    path "${gene_id_1}_${gene_id_2}.prot.fst"
+    script:
+    output_file="${gene_id_1}_${gene_id_2}.prot.fst"
+    """
+    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
+     """
+}
+
+process EXTRACT_TWO_CDS {
+    input:
+    tuple(val(gene_id_1), val(gene_id_2))
+    path fasta_file
+    output:
+    path "${gene_id_1}_${gene_id_2}.cds.fst"
+    script:
+    output_file="${gene_id_1}_${gene_id_2}.cds.fst"
+    """
+    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
+    """
+}
+
+
+
+process CLUSTALW2 {
+
+    label 'clustalw'
+
+    input:
+    path protein_sequence
+    output:
+    path "${protein_sequence.simpleName}.prot.ali.aln"
+    script:
+    """
+    clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln"
+    """
+}
+
+
+
+
+process PAL2NAL {
+
+    label 'pal2nal'
+
+    input:
+    path protein_alignment
+    path coding_sequence
+    output:
+    path "${protein_alignment.simpleName}.cds.ali.phy"
+    script:
+    """
+    pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy"
+    """
+}
+
+process YN00 {
+
+    label 'paml'
+
+    input:
+    path phylip_file
+    output:
+    path "${phylip_file.simpleName}.yn"
+    script:
+    """
+    echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
+    yn00
+    """
+}
+
+process EXTRACT_KA_KS {
+
+    input:
+    tuple(val(gene_id_1), val(gene_id_2))
+    path yn_file
+    output:
+    path 'csv_row'
+    script:
+    """
+    KaKs=\$(awk '
+    BEGIN {
+          OFS=","
+          on_good_section=0
+          skip=0
+    }
+    \$1 == "(B)" {
+         skip=8
+         on_good_section=1
+    }
+
+    on_good_section == 1 {
+        if (skip == 0) {
+            Ka=\$8
+            Ks=\$11
+            print Ka, Ks
+            exit
+        } else {
+            skip -= 1
+        }
+    }
+    ' "${yn_file}")
+    arr=(\${KaKs//,/ })
+    Ka=\${arr[0]}
+    Ks=\${arr[1]}
+    echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
+    """
+}
+
+workflow KA_KS {
+
+    take:
+    gene_id_pair
+    proteome_fasta
+    cds_fasta
+
+    main:
+    protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
+    cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
+    protein_alignment = CLUSTALW2(protein_sequences)
+    phylip = PAL2NAL(protein_alignment, cds_sequences)
+    yn = YN00(phylip)
+    kaks = EXTRACT_KA_KS(gene_id_pair, yn)
+
+    emit:
+    kaks = kaks
+}
diff --git a/workflow/KaKs/scripts/extract_fasta_records.sh b/workflow/KaKs/scripts/extract_fasta_records.sh
new file mode 100755
index 0000000..1bbff6c
--- /dev/null
+++ b/workflow/KaKs/scripts/extract_fasta_records.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env sh
+gene_id_1="${1}"
+gene_id_2="${2}"
+fasta_file="${3}"
+output_file="${4}"
+echo "" > "${output_file}"
+for gene_id in ${gene_id_1} ${gene_id_2}; do
+    awk -v gene_id="${gene_id}" '
+        BEGIN {
+            on_gene=0
+        }
+        /^[^>]/ && on_gene == 1 {
+            print $0
+        }
+        /^>/ {
+            gene = $4
+            gsub("gene:", "", gene)
+            if (gene == gene_id) {
+                on_gene=1
+                print $0
+            } else {
+                on_gene=0
+            }
+        }
+        ' "${fasta_file}" >> "${output_file}"
+done