comparative-genomics-project/workflow/KaKs/modules/ka_ks.nf

133 lines
2.7 KiB
Plaintext

/*
 * Extract the FASTA records of one gene pair into a single ".prot.fst" file.
 *
 * Inputs:
 *   (gene_id_1, gene_id_2) - the two record identifiers handed to the script
 *   fasta_file             - FASTA file to search (presumably a proteome,
 *                            given the ".prot" suffix - confirm with caller)
 * Output:
 *   "<gene_id_1>_<gene_id_2>.prot.fst", the name passed to the script as its
 *   fourth argument.
 *
 * The work is delegated to scripts/extract_fasta_records.sh below the pipeline
 * base directory; what that script does with its arguments is not visible here.
 */
process EXTRACT_TWO_PROTEINS {
    input:
    tuple val(gene_id_1), val(gene_id_2)
    path fasta_file

    output:
    path "${gene_id_1}_${gene_id_2}.prot.fst"

    script:
    def extractor  = "${baseDir}/scripts/extract_fasta_records.sh"
    def pair_fasta = "${gene_id_1}_${gene_id_2}.prot.fst"
    """
    bash "${extractor}" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${pair_fasta}"
    """
}
/*
 * Extract the FASTA records of one gene pair into a single ".cds.fst" file.
 *
 * Inputs:
 *   (gene_id_1, gene_id_2) - the two record identifiers handed to the script
 *   fasta_file             - FASTA file to search (presumably coding
 *                            sequences, given the ".cds" suffix - confirm)
 * Output:
 *   "<gene_id_1>_<gene_id_2>.cds.fst", the name passed to the script as its
 *   fourth argument.
 *
 * Uses the same scripts/extract_fasta_records.sh helper as
 * EXTRACT_TWO_PROTEINS; only the output suffix differs.
 */
process EXTRACT_TWO_CDS {
    input:
    tuple val(gene_id_1), val(gene_id_2)
    path fasta_file

    output:
    path "${gene_id_1}_${gene_id_2}.cds.fst"

    script:
    def extractor  = "${baseDir}/scripts/extract_fasta_records.sh"
    def pair_fasta = "${gene_id_1}_${gene_id_2}.cds.fst"
    """
    bash "${extractor}" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${pair_fasta}"
    """
}
/*
 * Run clustalw2 in alignment mode on one sequence file.
 *
 * Input:
 *   protein_sequence - file passed to clustalw2 via -infile
 * Output:
 *   "<simpleName>.prot.ali.aln", where simpleName is the input file name with
 *   every extension removed; the same name is given to clustalw2 via -outfile.
 */
process CLUSTALW2 {
    label 'clustalw'

    input:
    path protein_sequence

    output:
    path "${protein_sequence.simpleName}.prot.ali.aln"

    script:
    def alignment_name = "${protein_sequence.simpleName}.prot.ali.aln"
    """
    clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${alignment_name}"
    """
}
/*
 * Run pal2nal.pl on an alignment and a coding-sequence file, requesting
 * "-output paml", and capture its standard output.
 *
 * Inputs:
 *   protein_alignment - first positional argument of pal2nal.pl
 *   coding_sequence   - second positional argument of pal2nal.pl
 * Output:
 *   "<simpleName>.cds.ali.phy", named after the alignment file with every
 *   extension removed; it holds whatever pal2nal.pl writes to stdout.
 */
process PAL2NAL {
    label 'pal2nal'

    input:
    path protein_alignment
    path coding_sequence

    output:
    path "${protein_alignment.simpleName}.cds.ali.phy"

    script:
    def codon_alignment = "${protein_alignment.simpleName}.cds.ali.phy"
    """
    pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${codon_alignment}"
    """
}
/*
 * Write a yn00.ctl control file for one input file and run yn00 on it.
 *
 * Input:
 *   phylip_file - file referenced by the "seqfile" entry of the control file
 * Output:
 *   "<simpleName>.yn", the name written to the "outfile" entry.
 *
 * yn00 is started without arguments, so it is expected to pick up yn00.ctl
 * from the working directory (NOTE(review): confirm against the PAML manual).
 */
process YN00 {
    label 'paml'

    input:
    path phylip_file

    output:
    path "${phylip_file.simpleName}.yn"

    script:
    // One control-file entry per element; the trailing blanks on the first two
    // entries are kept so the generated file matches the historical one.
    def control = [
        "seqfile = ${phylip_file} ",
        "outfile = ${phylip_file.simpleName}.yn ",
        "verbose = 0",
        "icode = 0",
        "weighting = 0",
        "commonf3x4 = 0"
    ].join('\n')
    """
    echo "${control}" > yn00.ctl
    yn00
    """
}
/*
 * Pull two values out of a yn00 result file and write them, together with the
 * gene identifiers, as one tab-separated row.
 *
 * Inputs:
 *   (gene_id_1, gene_id_2) - identifiers written as the first two columns
 *   yn_file                - text file scanned by awk
 * Output:
 *   'csv_row' - a single line "<gene_id_1> <gene_id_2> <Ka> <Ks>" separated
 *               by tab characters (despite the name it is not comma-separated).
 *
 * How the values are located: awk looks for a line whose first field is "(B)"
 * and then reads fields 8 and 11 of the line that lies eight lines below it.
 * Those fields are taken to be Ka and Ks (presumably the dN and dS columns of
 * the yn00 section (B) table - verify against the yn00 output layout).
 * If "(B)" shows up again before that line is reached, the count restarts
 * from the later occurrence.
 */
process EXTRACT_KA_KS {
    input:
    tuple val(gene_id_1), val(gene_id_2)
    path yn_file

    output:
    path 'csv_row'

    script:
    """
    KaKs=\$(awk '
        BEGIN { OFS = "," }
        \$1 == "(B)" { target = NR + 8 }
        NR == target { print \$8, \$11; exit }
    ' "${yn_file}")
    fields=(\${KaKs//,/ })
    Ka=\${fields[0]}
    Ks=\${fields[1]}
    echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
    """
}
/*
 * Compute one Ka/Ks row per gene pair.
 *
 * take:
 *   gene_id_pair   - channel of (gene_id_1, gene_id_2) items
 *                    (assumed to be a channel, not a plain list - confirm
 *                    against the caller)
 *   proteome_fasta - FASTA handed to EXTRACT_TWO_PROTEINS
 *   cds_fasta      - FASTA handed to EXTRACT_TWO_CDS
 * emit:
 *   kaks - the 'csv_row' files produced by EXTRACT_KA_KS
 *
 * Steps: extract both sequence sets, align the proteins (CLUSTALW2), build the
 * codon alignment (PAL2NAL), run YN00, then extract the values.
 *
 * Why the joins: tasks of different gene pairs finish in arbitrary order, so
 * the n-th item of one process output is not guaranteed to belong to the same
 * pair as the n-th item of another channel. Feeding two such channels straight
 * into a two-input process can combine an alignment with another pair's CDS,
 * or label Ka/Ks values with the wrong gene IDs. Every file produced upstream
 * is named after the pair ("<id1>_<id2>.<suffix>") and downstream names are
 * derived with simpleName, so the text before the first '.' identifies the
 * pair and is used as the join key.
 *
 * Limitation: if gene IDs themselves contain '.', several pairs can share a
 * key; join then matches those items in arrival order, which is no worse than
 * the previous purely positional pairing.
 */
workflow KA_KS {
    take:
    gene_id_pair
    proteome_fasta
    cds_fasta

    main:
    protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
    cds_sequences     = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
    protein_alignment = CLUSTALW2(protein_sequences)

    // Re-associate each alignment with the CDS file of the same gene pair.
    alignment_with_cds = protein_alignment
        .map { aln -> [aln.simpleName, aln] }
        .join(cds_sequences.map { cds -> [cds.simpleName, cds] })

    // Both channels below are derived from the same joined channel, so their
    // items stay in the same relative order.
    phylip = PAL2NAL(
        alignment_with_cds.map { key, aln, cds -> aln },
        alignment_with_cds.map { key, aln, cds -> cds }
    )
    yn = YN00(phylip)

    // Re-associate each yn00 result with the gene IDs it was computed for.
    // The key is cut at the first '.' to mirror what simpleName does to the
    // file names.
    pair_with_yn = gene_id_pair
        .map { pair -> ["${pair[0]}_${pair[1]}".toString().tokenize('.')[0], pair[0], pair[1]] }
        .join(yn.map { yn_file -> [yn_file.simpleName, yn_file] })

    kaks = EXTRACT_KA_KS(
        pair_with_yn.map { key, id_1, id_2, yn_file -> [id_1, id_2] },
        pair_with_yn.map { key, id_1, id_2, yn_file -> yn_file }
    )

    emit:
    kaks = kaks
}