Add a bash script to compute Ka/Ks

2024-12-24 11:31:02 +01:00 · 2024-12-24 11:31:02 +01:00 · 3e3c184153
parent 5322a9686a
commit 3e3c184153
1 changed files with 76 additions and 0 deletions
--- a/workflow/KaKs/ugly.sh
+++ b/workflow/KaKs/ugly.sh
@ -0,0 +1,76 @@
 #!/usr/bin/env bash
 # Bash version of the Nextflow script that should run faster
 output_file="Glycine_max_duplicate_genes_KaKs.tsv"
 proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
 cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
 set -euo pipefail
 mkdir -p tmp
 echo "seqfile = ./tmp/cds.ali.phy
 outfile = ./tmp/yn
 verbose = 0
 icode = 0
 weighting = 0
 commonf3x4 = 0" > yn00.ctl
 compute_kaks() {
    local gene_id_1
    local gene_id_2
    gene_id_1=$1
    gene_id_2=$2
    # Extract the proteins
    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
    # Extract the CDS
    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
    # Run clustalw2
    clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
    # Run Pal2Nal
    pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
    # Run yn00
    yn00
    # Extract Ka/Ks
    awk '
    BEGIN {
          on_good_section=0
          skip=0
    }
    $1 == "(B)" {
         skip=8
         on_good_section=1
    }
    on_good_section == 1 {
        if (skip == 0) {
            Ka=$8
            Ks=$11
            print Ka, Ks
            exit
        } else {
            skip -= 1
        }
    }
    ' "./tmp/yn"
 }
 echo "gene_a,gene_b,ka,ks" > "${output_file}"
 input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
 n=$(wc -l "${input}")
 i=0
 while IFS= read -r line
 do
    echo $i / $n
    gene_a=$(echo $line | awk '{print $1}')
    gene_b=$(echo $line | awk '{print $2}')
    i=$(expr $i + 1)
    kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
    if [[ ! -z $kaks ]]
       then
       arr=(${kaks//\t/ })
       echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
    fi
 done < "${input}"