97 lines
2.5 KiB
Bash
97 lines
2.5 KiB
Bash
#!/usr/bin/env bash
|
|
set -euo pipefail
|
|
|
|
NB_THREADS=16
|
|
|
|
output_file="Glycine_max_duplicate_genes_KaKs_v3.csv"
|
|
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
|
|
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/cds_filtered.fa"
|
|
input_file="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.csv"
|
|
|
|
compute_kaks() {
|
|
local gene_id_1
|
|
local gene_id_2
|
|
gene_id_1="${1}"
|
|
gene_id_2="${2}"
|
|
# Extract the proteins
|
|
../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${proteome_fasta}" "./prot.fst"
|
|
# Extract the CDS
|
|
../../scripts/extract_fasta_records.sh "${gene_id_1}" "${gene_id_2}" "${cds_fasta}" "./cds.fst"
|
|
|
|
# Run clustalw2
|
|
clustalw2 -quiet -align -infile="./prot.fst" -outfile="./prot.ali.aln"
|
|
|
|
# Run Pal2Nal
|
|
pal2nal.pl "./prot.ali.aln" "./cds.fst" -output paml >"./cds.ali.phy"
|
|
|
|
# Run yn00
|
|
yn00
|
|
|
|
# Extract Ka/Ks
|
|
awk '
|
|
BEGIN {
|
|
on_good_section=0
|
|
skip=0
|
|
}
|
|
$1 == "(B)" {
|
|
skip=8
|
|
on_good_section=1
|
|
}
|
|
|
|
on_good_section == 1 {
|
|
if (skip == 0) {
|
|
Ka=$8
|
|
Ks=$11
|
|
print Ka, Ks
|
|
exit
|
|
} else {
|
|
skip -= 1
|
|
}
|
|
}
|
|
' "./yn"
|
|
}
|
|
|
|
thread() {
|
|
local thread_id="${1}"
|
|
local start="${2}"
|
|
local end="${3}"
|
|
|
|
mkdir -p "tmp/${thread_id}"
|
|
pushd "tmp/${thread_id}" || return
|
|
|
|
echo "seqfile = ./cds.ali.phy
|
|
outfile = ./yn
|
|
verbose = 0
|
|
icode = 0
|
|
weighting = 0
|
|
commonf3x4 = 0" >yn00.ctl
|
|
echo "" >"part.csv"
|
|
for line_index in $(seq $start $end); do
|
|
line=$(head -n "${line_index}" "${input_file}" | tail -n 1)
|
|
gene_a=$(echo "${line}" | awk '{print $1}')
|
|
gene_b=$(echo "${line}" | awk '{print $2}')
|
|
kaks=$(compute_kaks "${gene_a}" "${gene_b}" | tail -1 | awk 'NF == 2')
|
|
if [[ ! -z $kaks ]]; then
|
|
arr=(${kaks//\t/ })
|
|
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >>"part.csv"
|
|
fi
|
|
done
|
|
popd || return
|
|
}
|
|
|
|
main() {
|
|
part=$(($(cat "${input_file}" | wc -l) / ${NB_THREADS}))
|
|
for thread_id in $(seq 0 $NB_THREADS); do
|
|
start=$((thread_id * part))
|
|
end=$(($((thread_id + 1)) * part))
|
|
thread $thread_id $start $end &
|
|
done
|
|
wait
|
|
echo "gene_a,gene_b,ka,ks" >"${output_file}"
|
|
for thread_id in $(seq 0 $NB_THREADS); do
|
|
cat ./tmp/$thread_id/part.csv >>"${output_file}"
|
|
done
|
|
}
|
|
|
|
main
|