Add a bash script to compute Ka/Ks
This commit is contained in:
parent
5322a9686a
commit
3e3c184153
|
@ -0,0 +1,76 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
# Bash version of the Nextflow script that should run faster
|
||||||
|
output_file="Glycine_max_duplicate_genes_KaKs.tsv"
|
||||||
|
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
|
||||||
|
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
mkdir -p tmp
|
||||||
|
|
||||||
|
echo "seqfile = ./tmp/cds.ali.phy
|
||||||
|
outfile = ./tmp/yn
|
||||||
|
verbose = 0
|
||||||
|
icode = 0
|
||||||
|
weighting = 0
|
||||||
|
commonf3x4 = 0" > yn00.ctl
|
||||||
|
compute_kaks() {
|
||||||
|
local gene_id_1
|
||||||
|
local gene_id_2
|
||||||
|
gene_id_1=$1
|
||||||
|
gene_id_2=$2
|
||||||
|
# Extract the proteins
|
||||||
|
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
|
||||||
|
# Extract the CDS
|
||||||
|
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
|
||||||
|
|
||||||
|
# Run clustalw2
|
||||||
|
clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
|
||||||
|
|
||||||
|
# Run Pal2Nal
|
||||||
|
pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
|
||||||
|
|
||||||
|
# Run yn00
|
||||||
|
yn00
|
||||||
|
|
||||||
|
# Extract Ka/Ks
|
||||||
|
awk '
|
||||||
|
BEGIN {
|
||||||
|
on_good_section=0
|
||||||
|
skip=0
|
||||||
|
}
|
||||||
|
$1 == "(B)" {
|
||||||
|
skip=8
|
||||||
|
on_good_section=1
|
||||||
|
}
|
||||||
|
|
||||||
|
on_good_section == 1 {
|
||||||
|
if (skip == 0) {
|
||||||
|
Ka=$8
|
||||||
|
Ks=$11
|
||||||
|
print Ka, Ks
|
||||||
|
exit
|
||||||
|
} else {
|
||||||
|
skip -= 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
' "./tmp/yn"
|
||||||
|
}
|
||||||
|
echo "gene_a,gene_b,ka,ks" > "${output_file}"
|
||||||
|
input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
|
||||||
|
n=$(wc -l "${input}")
|
||||||
|
i=0
|
||||||
|
while IFS= read -r line
|
||||||
|
do
|
||||||
|
echo $i / $n
|
||||||
|
gene_a=$(echo $line | awk '{print $1}')
|
||||||
|
gene_b=$(echo $line | awk '{print $2}')
|
||||||
|
i=$(expr $i + 1)
|
||||||
|
kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
|
||||||
|
if [[ ! -z $kaks ]]
|
||||||
|
then
|
||||||
|
arr=(${kaks//\t/ })
|
||||||
|
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
|
||||||
|
fi
|
||||||
|
done < "${input}"
|
Loading…
Reference in New Issue