From 3e3c184153cf384dbc87b05308fa1a8a23347e5b Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Tue, 24 Dec 2024 11:31:02 +0100 Subject: [PATCH] Add a bash script to compute Ka/Ks --- workflow/KaKs/ugly.sh | 76 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 workflow/KaKs/ugly.sh diff --git a/workflow/KaKs/ugly.sh b/workflow/KaKs/ugly.sh new file mode 100644 index 0000000..e1ff4e8 --- /dev/null +++ b/workflow/KaKs/ugly.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# Bash version of the Nextflow script that should run faster +output_file="Glycine_max_duplicate_genes_KaKs.tsv" +proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa" +cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa" + +set -euo pipefail + +mkdir -p tmp + +echo "seqfile = ./tmp/cds.ali.phy +outfile = ./tmp/yn +verbose = 0 +icode = 0 +weighting = 0 +commonf3x4 = 0" > yn00.ctl +compute_kaks() { + local gene_id_1 + local gene_id_2 + gene_id_1=$1 + gene_id_2=$2 + # Extract the proteins + ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst" + # Extract the CDS + ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst" + + # Run clustalw2 + clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln" + + # Run Pal2Nal + pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy" + + # Run yn00 + yn00 + + # Extract Ka/Ks + awk ' + BEGIN { + on_good_section=0 + skip=0 + } + $1 == "(B)" { + skip=8 + on_good_section=1 + } + + on_good_section == 1 { + if (skip == 0) { + Ka=$8 + Ks=$11 + print Ka, Ks + exit + } else { + skip -= 1 + } + } + ' "./tmp/yn" +} +echo "gene_a,gene_b,ka,ks" > "${output_file}" +input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv" +n=$(wc -l "${input}") +i=0 +while IFS= read -r line +do + echo $i / $n + gene_a=$(echo $line | awk '{print $1}') + gene_b=$(echo $line | awk '{print $2}') + i=$(expr $i + 1) + kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2') + if [[ ! -z $kaks ]] + then + arr=(${kaks//\t/ }) + echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}" + fi +done < "${input}"