#!/usr/bin/env bash # Bash version of the Nextflow script that should run faster output_file="Glycine_max_duplicate_genes_KaKs.tsv" proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa" cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa" set -euo pipefail mkdir -p tmp echo "seqfile = ./tmp/cds.ali.phy outfile = ./tmp/yn verbose = 0 icode = 0 weighting = 0 commonf3x4 = 0" > yn00.ctl compute_kaks() { local gene_id_1 local gene_id_2 gene_id_1=$1 gene_id_2=$2 # Extract the proteins ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst" # Extract the CDS ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst" # Run clustalw2 clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln" # Run Pal2Nal pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy" # Run yn00 yn00 # Extract Ka/Ks awk ' BEGIN { on_good_section=0 skip=0 } $1 == "(B)" { skip=8 on_good_section=1 } on_good_section == 1 { if (skip == 0) { Ka=$8 Ks=$11 print Ka, Ks exit } else { skip -= 1 } } ' "./tmp/yn" } echo "gene_a,gene_b,ka,ks" > "${output_file}" input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv" n=$(wc -l "${input}") i=0 while IFS= read -r line do echo $i / $n gene_a=$(echo $line | awk '{print $1}') gene_b=$(echo $line | awk '{print $2}') i=$(expr $i + 1) kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2') if [[ ! -z $kaks ]] then arr=(${kaks//\t/ }) echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}" fi done < "${input}"