diff --git a/notebook.org b/notebook.org new file mode 100644 index 0000000..d7fd400 --- /dev/null +++ b/notebook.org @@ -0,0 +1,1009 @@ +#+title: Comparative Genomics Project +#+subtitle: Duplicate Genes in /Glycine max/ +#+date: 2024-2025 +#+author: Samuel Ortion +#+LATEX_CLASS: scrartcl +#+LATEX_HEADER: \titlehead{M2 GENIOMHE} + +* General infos + +<2024-10-25 Fri> + +** Objectives +*** Part 1 +1. Determine the amount of duplicate genes in my genome. + - What is the proportion of duplicate genes in my genome. + - Give an interval based on two datasets (low stringency dataset and high stringency dataset). + #+caption: High stringency dataset and low stringency dataset duplicate genes filtering criteria. + #+name: dataset-criteria + | | Low | High | + |------------+-------------------------------+-------------------------------| + | Coverage | 30% | 40% | + | %identity | 30% | 50% | + | Bitscore | no filtering | no filtering | + | Clustering | mcl (with default parameters) | mcl (with default parameters) | + + #+name: low-stringency + #+begin_src bash +min_coverage=30 +min_identity=30 + #+end_src + + #+name: high-stringency + #+begin_src bash +min_coverage=40 +min_identity=50 + #+end_src +2. Determine the amount of Tandemly Arrayed Genes (TAGs) in the genome (definition 0 and 1). +3. Determine the amount in plant's chloroplasts and mitochondria. +4. Is there a lot of duplicated operons in plants? + +*** Part 2 +1. Compute \(K_s\) values and use it as a proxy of the age of the duplication. +2. Are TAGs younger than other duplicate genes? + + +** Our species + +/Glycine max/ is the soy plant. + +* Download the source data + +https://plants.ensembl.org/Glycine_max/Info/Index + +https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-60/fasta/glycine_max/pep/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz + + +* Develop the Nextflow workflow + +** Filtering the proteome to keep the longest isoform + +I used three awk families_and_TAGs/scripts to filter the FASTA file to keep only the longest isoforms: + 1. =protein_lengths.awk= associate each FASTA record with the length of its sequence, + 2. =filter_longest.awk= keep the isoform ID that has the maximum length, + 3. =filter_records.awk= based on the list of isoform ID, retrieve the sequence of from the original file. + +=protein_lengths.awk= + +#+include: workflow/families_and_TAGs/scripts/protein_lengths.awk src awk + +=filter_longest.awk= + +#+include: workflow/families_and_TAGs/scripts/filter_longest.awk src awk + +=filter_records.awk= + +#+include: workflow/families_and_TAGs/scripts/filter_records_fasta.awk src awk + +How many protein coding gene are there? +#+name: protein-coding-gene-count +#+begin_src bash :export both +zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \ + | grep -v 'supercontig' \ + | awk ' /^>/ { print $4 }' | sort | uniq | wc -l +#+end_src + +#+RESULTS: protein-coding-gene-count +: 55589 + +#+RESULTS: +: 55897 + +** Filtering out Chloroplasts and Mitochondria proteins + +From the Ensembl Plants index page we can see that there is no chloroplast nor mitochondria proteins in the sequenced genome of /Glycine max/. + +** Filter out supercontigs + +<2024-10-28 Mon> + +#+include: workflow/families_and_TAGs/scripts/remove_supercontigs.awk src awk + +#+begin_src bash +zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \ + | awk -f ./workflow/families_and_TAGs/scripts/remove_supercontigs.awk \ + > ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa +#+end_src + +#+RESULTS: + +/Sanity check:/ Has the script removed all supercontigs proteins and only them? + +/How many records in supercontigs were there?/ +#+begin_src bash :export both +zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz \ + | awk -F' ' ' /^>/ { print $3}' | grep -c "^supercontig" +#+end_src + +#+RESULTS: +: 435 + +/How many records have been removed?/ + +#+name: all-isoforms +#+begin_src bash +zgrep -c "^>" ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz +#+end_src + +#+RESULTS: all-isoforms +: 88412 + +#+name: all-isoforms-no-supercontig +#+begin_src bash +grep -c "^>" ./tmp/Glycine_max.Glycine_max_v2.1.pep.all.nosupercontig.fa +#+end_src + +#+RESULTS: all-isoforms-no-supercontig +: 87977 + +#+begin_src bash :var ALL_ISOFORMS=all-isoforms :var NO_SUPERCONTIGS=all-isoforms-no-supercontig :exports both +expr $ALL_ISOFORMS - $NO_SUPERCONTIGS +#+end_src + +#+RESULTS: +: 435 + +** BLASTp All Against All + +#+begin_src bash +makeblastdb -in "$proteome" -dbtype prot -out 'tmp/blastdb' -title "Glycine max" +#+end_src + +#+begin_src bash +blastp -query "$proteome" -db 'tmp/blastdb' -outfmt 6 -out "tmp/Glycine_max.blastp.tsv" +#+end_src + +** Clustering + +Using =mcl=. + +1. Extract the homology in ABC format + #+begin_src bash +awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc' + #+end_src +2. Run =mcl= with default parameters + #+begin_src bash +mcl 'graph.abc' --abc -o 'custering.mcl' + #+end_src +3. Convert =mcl= output to TSV with two columns =gene id=, =family id=. + #+include: workflow/families_and_TAGs/scripts/mcl_to_tsv.awk src awk + + #+begin_src bash +awk -f workflow/families_and_TAGs/scripts/mcl_to_tsv.awk 'clustering.mcl' > 'families.tsv' + #+end_src + + +** Running the Nextflow workflow + +Update the parameters in =nextflow.config= and run +#+begin_src bash +nextflow run main.nf -profile conda -resume +#+end_src + +from the =./workflow/= folder + + +* From the given BLASTp format 6 file +<2024-11-02 Sat> +/How many BLASTp HSP have been reported?/ +#+begin_src bash +cat ../data/Glycine_max_Blastp_longIsoforme | wc -l +#+end_src + +#+RESULTS: +: 4456730 + +** Remove HSP on proteins found on supercontigs + +Extract the isoform ID whose gene is on supercontigs. +#+begin_src bash +zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz| awk -F' ' '/^>/ && $3 ~ /^supercontig/ { sequence_id = $1; gsub(">", "", sequence_id); print sequence_id }' > ./tmp/protein_on_supercontigs.list +#+end_src + +#+RESULTS: + +#+begin_src bash +cat ./tmp/protein_on_supercontigs.list | wc -l +#+end_src + +#+RESULTS: +: 435 + +Remove HSP with subject or query being on a supercontig + +=filter_blastp_sequence_id.awk= + +#+include: workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk src awk + +#+begin_src bash +awk -f workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk ./tmp/protein_on_supercontigs.list ../data/Glycine_max_Blastp_longIsoforme > ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig +#+end_src + +#+RESULTS: + +How many HSP have been removed? + +#+begin_src bash +length() { + cat $1 | wc -l + } +expr $(length ../data/Glycine_max_Blastp_longIsoforme) - $(length ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig) +#+end_src + +#+RESULTS: +: 36598 + +#+begin_src bash +cat ./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig | wc -l +#+end_src + +#+RESULTS: +: 4420132 + + +** Filter by coverage and identity percentage + +*** Add protein length and gene name columns + +**** Extract the protein length and gene id from the FASTA file + + +#+begin_src bash +zcat ../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz | awk -f ./workflow/families_and_TAGs/scripts/protein_lengths.awk > ./tmp/Glycine_max_protein_length_gene.tsv +#+end_src + +#+RESULTS: + + +**** Join the BLASTp HSP file with the gene id and gene lengths + +#+begin_src bash +protein_length=./tmp/Glycine_max_protein_length_gene.tsv +blastp=./tmp/Glycine_max_Blastp_longIsoforme_nosupercontig +sort -k 1 "${blastp}" > tmp/blastp_s +sort -k 1 "${protein_length}" > tmp/protein_length_s +join -1 1 -2 1 -t $'\t' tmp/blastp_s tmp/protein_length_s > tmp/join1.tsv +sort -k 2 tmp/join1.tsv > tmp/join1.tsv_s +join -1 2 -2 1 -t $'\t' tmp/join1.tsv_s tmp/protein_length_s > tmp/joined.blastp.tsv +#+end_src + +#+RESULTS: + +**** Filter the BLASTp according to coverage and identity percentage + +#+name: filter-blastp +#+begin_src bash +blastp=./tmp/joined.blastp.tsv +awk -f "workflow/families_and_TAGs/scripts/filter_blastp.awk" \ + -v min_coverage="${min_coverage}" \ + -v min_identity="${min_identity}" \ + "${blastp}" > "tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.tsv" +#+end_src + +#+RESULTS: filter-blastp + +***** First dataset + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash +cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.tsv | wc -l +#+end_src + +#+RESULTS: +: 1520850 + +#+begin_src bash +cat ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.tsv | wc -l +#+end_src + +#+RESULTS: +: 342180 + +** Clustering +<2024-11-03 Sun> +*** Extract the homology graph + +#+name: extract-abc +#+begin_src bash +blastp="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.tsv" +abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.abc" +awk 'BEGIN { FS = OFS = "\t" } { print $14, $16, $12 }' "$blastp" > "$abc" +#+end_src + + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash +head -n 1 "tmp/Glycine_max_Blastp_filtered_coverage30_identity30.abc" +#+end_src + +#+RESULTS: +| GLYMA_10G098500 | GLYMA_U008600 | 154 | + +*** Filter the homology graph to keep only the heaviest weight + + +#+include: workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk src awk + +#+name: filter-abc +#+begin_src bash +original_abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.abc" +filtered_abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}_f.abc" +awk -f "workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk" "$original_abc" > "$filtered_abc" +#+end_src + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +*** Run Markov Clustering + +#+name: mcl +#+begin_src bash +abc="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}_f.abc" +mcl="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl" +mcl "${abc}" --abc --discard-loops=yes -o "$mcl" +#+end_src + +#+RESULTS: mcl + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +*** Transform MCL output into a gene to family mapping TSV file + +#+name: mcl-to-tsv +#+begin_src bash +mcl="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl" +tsv="tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl.tsv" +awk -f "workflow/families_and_TAGs/scripts/mcl_to_tsv.awk" "${mcl}" > "${tsv}" +#+end_src + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+name: duplicate-genes-count-high-stringency +#+begin_src bash +tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv" +cat "${tsv}" | wc -l +#+end_src + +#+RESULTS: duplicate-genes-count-high-stringency +: 46769 + +#+name: duplicate-genes-count-low-stringency +#+begin_src bash +tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv" +cat "${tsv}" | wc -l +#+end_src + +#+RESULTS: duplicate-genes-count-low-stringency +: 50254 + +#+begin_src bash +tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv" +head "${tsv}" +#+end_src + +#+RESULTS: +| GLYMA_12G034500 | 1 | +| GLYMA_12G138300 | 1 | +| GLYMA_12G124600 | 1 | +| GLYMA_12G126700 | 1 | +| GLYMA_12G125200 | 1 | +| GLYMA_12G034900 | 1 | +| GLYMA_U036400 | 1 | +| GLYMA_12G124200 | 1 | +| GLYMA_12G126300 | 1 | +| GLYMA_12G035100 | 1 | + +#+begin_src bash +tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv" +head "${tsv}" +#+end_src + +#+RESULTS: +| GLYMA_07G053000 | 1 | +| GLYMA_10G247700 | 1 | +| GLYMA_12G188000 | 1 | +| GLYMA_07G159200 | 1 | +| GLYMA_13G313300 | 1 | +| GLYMA_15G016600 | 1 | +| GLYMA_14G159200 | 1 | +| GLYMA_18G287500 | 1 | +| GLYMA_20G132900 | 1 | +| GLYMA_16G204800 | 1 | + +What is the interval of duplicate gene count? There are between 48109 and 49391 duplicate genes in /Glycine max/. + +What is the number of families? + +#+begin_src bash +tsv="tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv" +cut -f2 "${tsv}" | sort | uniq | wc -l +#+end_src + +#+RESULTS: +: 8426 + +#+begin_src bash +tsv="tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl.tsv" +cut -f2 "${tsv}" | sort | uniq | wc -l +#+end_src + +#+RESULTS: +: 11997 + +What is the number of singletons? + +#+name: singletons-low-stringency +#+begin_src bash :var ALL=protein-coding-gene-count :var DUPLICATED=duplicate-genes-count-low-stringency +expr $ALL - $DUPLICATED +#+end_src + +#+RESULTS: singletons-low-stringency +: 5643 + +#+name: singletons-high-stringency +#+begin_src bash :var ALL=protein-coding-gene-count :var DUPLICATED=duplicate-genes-count-high-stringency +expr $ALL - $DUPLICATED +#+end_src + +#+RESULTS: singletons-high-stringency +: 9128 + +* Amount of Tandemly Arrayed Genes + +*** Extract gene position from a GFF3 file + +#+include: workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk src awk + +#+begin_src bash +zcat "../data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz" \ + | awk -f "./workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk" \ + > "./tmp/Glycine_max.gene_positions.tsv" +#+end_src + +#+RESULTS: + +#+begin_src bash +head "./tmp/Glycine_max.gene_positions.tsv" +#+end_src + +#+RESULTS: +| GLYMA_01G000100 | 1 | 27355 | 28320 | +| GLYMA_01G000200 | 1 | 58975 | 67527 | +| GLYMA_01G000300 | 1 | 67770 | 69968 | +| GLYMA_01G000400 | 1 | 90152 | 95947 | +| GLYMA_01G000500 | 1 | 90289 | 91197 | +| GLYMA_01G000600 | 1 | 116094 | 127845 | +| GLYMA_01G000700 | 1 | 143467 | 155573 | +| GLYMA_01G000800 | 1 | 157030 | 157772 | +| GLYMA_01G000900 | 1 | 170534 | 193342 | +| GLYMA_01G001000 | 1 | 196256 | 201895 | + +#+name: gene-positions-count +#+begin_src bash +cat "./tmp/Glycine_max.gene_positions.tsv" \ + | wc -l +#+end_src + +#+RESULTS: +: 55589 + +There is 55897 protein coding genes in the sequence file, but only 55589 gene positions. + +#+begin_src bash :var POSITIONS=gene-positions-count :var GENES=protein-coding-gene-count :exports both +expr $GENES - $POSITIONS +#+end_src + +#+RESULTS: +: 308 + +*** Extract TAGs + +#+name: extract-TAGs +#+begin_src bash +positions="./tmp/Glycine_max.gene_positions.tsv" +families="./tmp/Glycine_max_Blastp_filtered_coverage${min_coverage}_identity${min_identity}.mcl.tsv" +./rust/tagfinder/target/release/tagfinder \ + --families "$families" \ + --positions "$positions" \ + --definitions 0,1 \ + > "./tmp/Glycine_max_TAGs_coverage${min_coverage}_identity${min_identity}.tsv" +#+end_src + +#+RESULTS: extract-TAGs + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +# LocalWords: operons + +#+begin_src bash +head "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv" +#+end_src + +#+RESULTS: +| gene | family | tag0 | tag1 | +| GLYMA_07G257400 | 3109 | - | - | +| GLYMA_17G203900 | 2067 | - | - | +| GLYMA_20G062000 | 509 | - | - | +| GLYMA_16G099400 | 40 | - | - | +| GLYMA_14G045400 | 1136 | 804 | 896 | +| GLYMA_11G189200 | 15 | - | - | +| GLYMA_02G191500 | 1310 | - | - | +| GLYMA_02G308200 | 520 | - | - | +| GLYMA_14G205200 | 8 | - | - | + +How many TAGs for definition 0? + +#+begin_src bash +awk 'BEGIN { + max=0 + } + NR > 1 { + if ($3 > max) { + max=$3 + } + } + END { + print max + } + ' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv" +#+end_src + +#+RESULTS: +: 3208 + +For TAG1? + +#+begin_src bash +awk 'BEGIN { + max=0 + } + NR > 1 { + if ($4 > max) { + max=$4 + } + } + END { + print max + } + ' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv" +#+end_src + +#+RESULTS: +: 3481 + + +#+begin_src bash +awk 'BEGIN { + max=0 + } + NR > 1 { + if ($3 > max) { + max=$3 + } + } + END { + print max + } + ' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv" +#+end_src + +#+RESULTS: +: 2500 + + +#+begin_src bash +awk 'BEGIN { + max=0 + } + NR > 1 { + if ($4 > max) { + max=$4 + } + } + END { + print max + } + ' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv" +#+end_src + +#+RESULTS: +: 2652 + + +*** Size of the greatest TAG + +**** High stringency + +/TAG₀/ + +#+begin_src bash +awk ' + { + tag_index=$3 + if (tag_index != "-") + gene_count[tag_index]++ + } + END { + max = 0 + max_tag_index = "" + for (tag_index in gene_count) { + if (gene_count[tag_index] > max) { + max_tag_index = tag_index + max = gene_count[tag_index] + } + } + print max_tag_index, max + }' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv" +#+end_src + +#+RESULTS: +: 421 53 + +/TAG₁/ + +#+begin_src bash +awk ' + { + tag_index=$4 + if (tag_index != "-") + gene_count[tag_index]++ + } + END { + max = 0 + max_tag_index = "" + for (tag_index in gene_count) { + if (gene_count[tag_index] > max) { + max_tag_index = tag_index + max = gene_count[tag_index] + } + } + print max_tag_index, max + }' "./tmp/Glycine_max_TAGs_coverage40_identity50.tsv" +#+end_src + +#+RESULTS: +: 459 76 + +**** Low stringency + + +/TAG₀/ + +#+begin_src bash +awk ' + { + tag_index=$3 + if (tag_index != "-") + gene_count[tag_index]++ + } + END { + max = 0 + max_tag_index = "" + for (tag_index in gene_count) { + if (gene_count[tag_index] > max) { + max_tag_index = tag_index + max = gene_count[tag_index] + } + } + print max_tag_index, max + }' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv" +#+end_src + +#+RESULTS: +: 579 76 + +/TAG₁/ + +#+begin_src bash +awk ' + { + tag_index=$4 + if (tag_index != "-") + gene_count[tag_index]++ + } + END { + max = 0 + max_tag_index = "" + for (tag_index in gene_count) { + if (gene_count[tag_index] > max) { + max_tag_index = tag_index + max = gene_count[tag_index] + } + } + print max_tag_index, max + }' "./tmp/Glycine_max_TAGs_coverage30_identity30.tsv" +#+end_src + +#+RESULTS: +: 646 76 + +* DONE Complete slides +<2024-11-23 Sat> + +* Family sizes + +#+begin_src bash +head -1 ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl \ + | awk '{ print NF }' +#+end_src + +#+RESULTS: +: 437 + +#+begin_src bash +head -1 ./tmp/Glycine_max_Blastp_filtered_coverage40_identity50.mcl \ + | awk '{ print NF }' +#+end_src + +#+RESULTS: +: 159 + + +* TODO $K_s$ For all duplicated pairs within a family + +Let $A$ be a family of $n$ duplicate members. The number of duplicate pairs is $n(n-1) / 2$. + +PAML package: +#+begin_src bash +conda install paml pal2nal clustalw -c bioconda +#+end_src + +** Use a Nextflow workflow +<2024-12-22 Sun> + +*** All pair of duplicate genes + +Generated by a rust program in =rust/pairs= folder. +The algorithm is simple. +We assume the gene-families mapping file is sorted by family index. +For each family, loop for i from 0 to the size of the family minus one, loop for j from i plus one to the size of the family and output the pair gene i - gene j. + + +Example: + +#+begin_src bash +cat ./tmp/Glycine_max_Blastp_filtered_coverage30_identity30.mcl.tsv | ./rust/pairs/target/release/pairs > ./tmp/Glycine_max_duplicate_gene_pairs.tsv +head ./tmp/Glycine_max_duplicate_gene_pairs.tsv +#+end_src + +#+RESULTS: +| GLYMA_07G053000 | GLYMA_10G247700 | +| GLYMA_07G053000 | GLYMA_12G188000 | +| GLYMA_07G053000 | GLYMA_07G159200 | +| GLYMA_07G053000 | GLYMA_13G313300 | +| GLYMA_07G053000 | GLYMA_15G016600 | +| GLYMA_07G053000 | GLYMA_14G159200 | +| GLYMA_07G053000 | GLYMA_18G287500 | +| GLYMA_07G053000 | GLYMA_20G132900 | +| GLYMA_07G053000 | GLYMA_16G204800 | +| GLYMA_07G053000 | GLYMA_08G302300 | + +*** PAML Steps +Example for the first gene pair. + +/Extract the CDS sequence and protein sequence of the genes/ + +#+name: pair-gene-id +#+begin_src bash +gene_id_1="GLYMA_07G053000" +gene_id_2="GLYMA_10G247700" +#+end_src + +CDS: + + +#+name: pair-gene-cds-extract +#+begin_src bash :noweb strip-export +output_file="./tmp/${gene_id_1}_${gene_id_2}.cds.fst" +echo "" > "${output_file}" +for gene_id in "${gene_id_1}" "${gene_id_2}"; do + zcat ../data/Glycine_max.Glycine_max_v2.1.cds.all.fa.gz | awk -v gene_id="${gene_id}" ' + BEGIN { + on_gene=0 + } + /^[^>]/ && on_gene == 1 { + print $0 + } + /^>/ { + gene = $4 + gsub("gene:", "", gene) + if (gene == gene_id) { + on_gene=1 + print $0 + } else { + on_gene=0 + } + } + ' >> "${output_file}" +done +#+end_src + +#+RESULTS: pair-gene-cds-extract + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +Proteins: + +#+name: pair-gene-protein-extract +#+begin_src bash :noweb strip-export +output_file="./tmp/${gene_id_1}_${gene_id_2}.prot.fst" +echo "" > "${output_file}" +for gene_id in "${gene_id_1}" "${gene_id_2}"; do + cat ./tmp/proteome_filtered.fa | awk -v gene_id="${gene_id}" ' + BEGIN { + on_gene=0 + } + /^[^>]/ && on_gene == 1 { + print $0 + } + /^>/ { + gene = $4 + gsub("gene:", "", gene) + if (gene == gene_id) { + on_gene=1 + print $0 + } else { + on_gene=0 + } + } + ' >> "${output_file}" +done +#+end_src + +#+RESULTS: pair-gene-protein-extract + + +#+begin_src bash :noweb strip-export +<> +<> +#+end_src + +#+RESULTS: + +/Clustalw2 alignment/ + +#+begin_src bash :noweb strip-export +<> +protein_sequence="./tmp/${gene_id_1}_${gene_id_2}.prot.fst" +clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="./tmp/${gene_id_1}_${gene_id_2}.prot.ali.aln" +#+end_src + +#+RESULTS: + +/Convert protein alignment to coding sequence alignment/ +#+begin_src bash :noweb strip-export +<> +protein_alignment="./tmp/${gene_id_1}_${gene_id_2}.prot.ali.aln" +coding_sequence="./tmp/${gene_id_1}_${gene_id_2}.cds.fst" +pal2nal "${protein_alignment}" "${coding_sequence}" -output paml > "./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy" +#+end_src + + +/YN00 Ka-Ks computation/ + +#+begin_src bash :noweb strip-export +<> +phylip_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy" +echo "seqfile = ${phylip_file}\noutfile = ${phylip_file}.yn\nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl +yn00 +#+end_src + +/Extraction of Ka-Ks values/ + +#+begin_src bash :noweb strip-export +<> +yn_file="./tmp/${gene_id_1}_${gene_id_2}.cds.ali.phy.yn" +awk ' + BEGIN { + on_good_section=0 + skip=0 + } + $1 == "(B)" { + skip=8 + on_good_section=1 + } + + on_good_section == 1 { + if (skip == 0) { + Ka=$8 + Ks=$11 + print Ka, Ks + exit + } else { + skip -= 1 + } + } + ' "${yn_file}" +#+end_src + +#+RESULTS: +: 0.6207 1.9409 + +*** Run the Nextflow workflow +<2024-12-23 Mon> + + +* TODO Does /Glycine max/ have big TAGs and which function are the big TAG gene implied? + + +* TODO Are TAG pairs different in age from non-TAG pairs? + +* TODO Are genes inside a TAG orientated in the same way more often than random? diff --git a/workflow/KaKs/conda/clustalw.yaml b/workflow/KaKs/conda/clustalw.yaml new file mode 100644 index 0000000..05a370b --- /dev/null +++ b/workflow/KaKs/conda/clustalw.yaml @@ -0,0 +1,6 @@ +name: clustalw +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::clustalw diff --git a/workflow/KaKs/conda/pal2nal.yaml b/workflow/KaKs/conda/pal2nal.yaml new file mode 100644 index 0000000..4370ce2 --- /dev/null +++ b/workflow/KaKs/conda/pal2nal.yaml @@ -0,0 +1,6 @@ +name: pal2nal +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::pal2nal diff --git a/workflow/KaKs/conda/paml.yaml b/workflow/KaKs/conda/paml.yaml new file mode 100644 index 0000000..3604566 --- /dev/null +++ b/workflow/KaKs/conda/paml.yaml @@ -0,0 +1,6 @@ +name: paml +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::paml diff --git a/workflow/KaKs/main.nf b/workflow/KaKs/main.nf new file mode 100644 index 0000000..196d970 --- /dev/null +++ b/workflow/KaKs/main.nf @@ -0,0 +1,39 @@ +/** Compute Ka, Ks values for each +/** pair of duplicate genes in a TAG +/** and in a gene family +/** */ + +include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf" +include { KA_KS } from "./modules/ka_ks.nf" + +process GENE_PAIRS { + input: + path families + output: + path 'pairs' + script: + """ + cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs' + """ +} + +workflow { + families = params.families + cds_gz = params.cds + proteome = params.proteome + + cds = GUNZIP_CDS(cds_gz) + + pairs = GENE_PAIRS(families) + .splitCsv(sep: '\t', header: false) + .map { row -> tuple(row[0], row[1]) } + + + kaks = KA_KS(pairs, proteome, cds) + + // Save to a CSV with collectFile + header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs") + header.concat( kaks ) + .collectFile( name: 'test.txt', newLine: true, sort: false ) + .view() +} diff --git a/workflow/KaKs/modules/gunzip.nf b/workflow/KaKs/modules/gunzip.nf new file mode 100644 index 0000000..2c68cf6 --- /dev/null +++ b/workflow/KaKs/modules/gunzip.nf @@ -0,0 +1,12 @@ +process GUNZIP { + input: + path zipped_file + + output: + path zipped_file.baseName + + script: + """ + gunzip -c "${zipped_file}" > "${zipped_file.baseName}" + """ +} diff --git a/workflow/KaKs/modules/ka_ks.nf b/workflow/KaKs/modules/ka_ks.nf new file mode 100644 index 0000000..481b3e8 --- /dev/null +++ b/workflow/KaKs/modules/ka_ks.nf @@ -0,0 +1,132 @@ + +process EXTRACT_TWO_PROTEINS { + input: + tuple(val(gene_id_1), val(gene_id_2)) + path fasta_file + output: + path "${gene_id_1}_${gene_id_2}.prot.fst" + script: + output_file="${gene_id_1}_${gene_id_2}.prot.fst" + """ + bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}" + """ +} + +process EXTRACT_TWO_CDS { + input: + tuple(val(gene_id_1), val(gene_id_2)) + path fasta_file + output: + path "${gene_id_1}_${gene_id_2}.cds.fst" + script: + output_file="${gene_id_1}_${gene_id_2}.cds.fst" + """ + bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}" + """ +} + + + +process CLUSTALW2 { + + label 'clustalw' + + input: + path protein_sequence + output: + path "${protein_sequence.simpleName}.prot.ali.aln" + script: + """ + clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln" + """ +} + + + + +process PAL2NAL { + + label 'pal2nal' + + input: + path protein_alignment + path coding_sequence + output: + path "${protein_alignment.simpleName}.cds.ali.phy" + script: + """ + pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy" + """ +} + +process YN00 { + + label 'paml' + + input: + path phylip_file + output: + path "${phylip_file.simpleName}.yn" + script: + """ + echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl + yn00 + """ +} + +process EXTRACT_KA_KS { + + input: + tuple(val(gene_id_1), val(gene_id_2)) + path yn_file + output: + path 'csv_row' + script: + """ + KaKs=\$(awk ' + BEGIN { + OFS="," + on_good_section=0 + skip=0 + } + \$1 == "(B)" { + skip=8 + on_good_section=1 + } + + on_good_section == 1 { + if (skip == 0) { + Ka=\$8 + Ks=\$11 + print Ka, Ks + exit + } else { + skip -= 1 + } + } + ' "${yn_file}") + arr=(\${KaKs//,/ }) + Ka=\${arr[0]} + Ks=\${arr[1]} + echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row + """ +} + +workflow KA_KS { + + take: + gene_id_pair + proteome_fasta + cds_fasta + + main: + protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta) + cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta) + protein_alignment = CLUSTALW2(protein_sequences) + phylip = PAL2NAL(protein_alignment, cds_sequences) + yn = YN00(phylip) + kaks = EXTRACT_KA_KS(gene_id_pair, yn) + + emit: + kaks = kaks +} diff --git a/workflow/KaKs/scripts/extract_fasta_records.sh b/workflow/KaKs/scripts/extract_fasta_records.sh new file mode 100755 index 0000000..1bbff6c --- /dev/null +++ b/workflow/KaKs/scripts/extract_fasta_records.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env sh +gene_id_1="${1}" +gene_id_2="${2}" +fasta_file="${3}" +output_file="${4}" +echo "" > "${output_file}" +for gene_id in ${gene_id_1} ${gene_id_2}; do + awk -v gene_id="${gene_id}" ' + BEGIN { + on_gene=0 + } + /^[^>]/ && on_gene == 1 { + print $0 + } + /^>/ { + gene = $4 + gsub("gene:", "", gene) + if (gene == gene_id) { + on_gene=1 + print $0 + } else { + on_gene=0 + } + } + ' "${fasta_file}" >> "${output_file}" +done