#!/usr/bin/env bash set -euo pipefail NB_THREADS=16 output_file="Glycine_max_duplicate_genes_KaKs_v3.csv" proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa" cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/cds_filtered.fa" input_file="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.csv" compute_kaks() { local gene_id_1 local gene_id_2 gene_id_1="${1}" gene_id_2="${2}" # Extract the proteins ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${proteome_fasta}" "./prot.fst" # Extract the CDS ../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${cds_fasta}" "./cds.fst" # Run clustalw2 clustalw2 -quiet -align -infile="./prot.fst" -outfile="./prot.ali.aln" # Run Pal2Nal pal2nal.pl "./prot.ali.aln" "./cds.fst" -output paml >"./cds.ali.phy" # Run yn00 yn00 # Extract Ka/Ks awk ' BEGIN { on_good_section=0 skip=0 } $1 == "(B)" { skip=8 on_good_section=1 } on_good_section == 1 { if (skip == 0) { Ka=$8 Ks=$11 print Ka, Ks exit } else { skip -= 1 } } ' "./yn" } thread() { local thread_id="${1}" local start="${2}" local end="${3}" mkdir -p "tmp/${thread_id}" pushd "tmp/${thread_id}" || return echo "seqfile = ./cds.ali.phy outfile = ./yn verbose = 0 icode = 0 weighting = 0 commonf3x4 = 0" >yn00.ctl echo "" >"part.csv" for line_index in $(seq $start $end); do line=$(head -n "${line_index}" "${input_file}" | tail -n 1) gene_a=$(echo "${line}" | awk '{print $1}') gene_b=$(echo "${line}" | awk '{print $2}') kaks=$(compute_kaks "${gene_a}" "${gene_b}" | tail -1 | awk 'NF == 2') if [[ ! -z $kaks ]]; then arr=(${kaks//\t/ }) echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >>"part.csv" fi done popd || return } main() { part=$(($(cat "${input_file}" | wc -l) / ${NB_THREADS})) for thread_id in $(seq 0 $NB_THREADS); do start=$((thread_id * part)) end=$(($((thread_id + 1)) * part)) thread $thread_id $start $end & done wait echo "gene_a,gene_b,ka,ks" >"${output_file}" for thread_id in $(seq 0 $NB_THREADS); do cat ./tmp/$thread_id/part.csv >>"${output_file}" done } main