133 lines
2.7 KiB
Plaintext
133 lines
2.7 KiB
Plaintext
|
|
||
|
/*
 * EXTRACT_TWO_PROTEINS
 *
 * Runs scripts/extract_fasta_records.sh (resolved against baseDir) with the
 * two gene identifiers, the FASTA file and the name of the file to write.
 * Presumably the helper copies the two matching records into that file;
 * the helper is not part of this file, so confirm against the script itself.
 *
 * input:
 *   - tuple of two values: gene_id_1, gene_id_2
 *   - fasta_file: path of the FASTA file handed to the helper script
 * output:
 *   - path "<gene_id_1>_<gene_id_2>.prot.fst"
 */
process EXTRACT_TWO_PROTEINS {

    input:
    tuple val(gene_id_1), val(gene_id_2)
    path fasta_file

    output:
    path "${gene_id_1}_${gene_id_2}.prot.fst"

    script:
    // `def` makes the variable local to the task being rendered. Without it
    // the assignment lands in a scope shared across tasks, so two tasks
    // rendered concurrently could pick up each other's file name.
    def output_file = "${gene_id_1}_${gene_id_2}.prot.fst"
    """
    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
    """
}
|
||
|
|
||
|
/*
 * EXTRACT_TWO_CDS
 *
 * Runs scripts/extract_fasta_records.sh (resolved against baseDir) with the
 * two gene identifiers, the FASTA file and the name of the file to write.
 * Presumably the helper copies the two matching records into that file;
 * the helper is not part of this file, so confirm against the script itself.
 *
 * input:
 *   - tuple of two values: gene_id_1, gene_id_2
 *   - fasta_file: path of the FASTA file handed to the helper script
 * output:
 *   - path "<gene_id_1>_<gene_id_2>.cds.fst"
 */
process EXTRACT_TWO_CDS {

    input:
    tuple val(gene_id_1), val(gene_id_2)
    path fasta_file

    output:
    path "${gene_id_1}_${gene_id_2}.cds.fst"

    script:
    // `def` makes the variable local to the task being rendered. Without it
    // the assignment lands in a scope shared across tasks, so two tasks
    // rendered concurrently could pick up each other's file name.
    def output_file = "${gene_id_1}_${gene_id_2}.cds.fst"
    """
    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
    """
}
|
||
|
|
||
|
|
||
|
|
||
|
/*
 * CLUSTALW2
 *
 * Invokes `clustalw2 -quiet -align` on the given sequence file and asks it to
 * write the result to "<simpleName of the input>.prot.ali.aln".
 *
 * input:
 *   - protein_sequence: path passed to clustalw2 as -infile
 * output:
 *   - path "<protein_sequence.simpleName>.prot.ali.aln"
 */
process CLUSTALW2 {

    label 'clustalw'

    input:
    path protein_sequence

    output:
    path "${protein_sequence.simpleName}.prot.ali.aln"

    script:
    // Same name as declared in the output block, computed once for the command.
    def alignment_file = "${protein_sequence.simpleName}.prot.ali.aln"
    """
    clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${alignment_file}"
    """
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/*
 * PAL2NAL
 *
 * Runs `pal2nal.pl` on a protein alignment and a coding-sequence file with
 * `-output paml`, redirecting its standard output to
 * "<simpleName of the alignment>.cds.ali.phy".
 *
 * input:
 *   - protein_alignment: path, first positional argument of pal2nal.pl
 *   - coding_sequence:   path, second positional argument of pal2nal.pl
 * output:
 *   - path "<protein_alignment.simpleName>.cds.ali.phy"
 */
process PAL2NAL {

    label 'pal2nal'

    input:
    path protein_alignment
    path coding_sequence

    output:
    path "${protein_alignment.simpleName}.cds.ali.phy"

    script:
    // Same name as declared in the output block, computed once for the command.
    def codon_alignment = "${protein_alignment.simpleName}.cds.ali.phy"
    """
    pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${codon_alignment}"
    """
}
|
||
|
|
||
|
/*
 * YN00
 *
 * Writes a `yn00.ctl` control file into the task directory and then runs
 * `yn00` with no arguments. The control file names the input file as
 * `seqfile` and "<simpleName of the input>.yn" as `outfile`, and sets
 * verbose, icode, weighting and commonf3x4 to 0.
 *
 * input:
 *   - phylip_file: path written into the control file as seqfile
 * output:
 *   - path "<phylip_file.simpleName>.yn"
 */
process YN00 {

    label 'paml'

    input:
    path phylip_file

    output:
    path "${phylip_file.simpleName}.yn"

    script:
    // One entry per control-file line. The trailing blank after the two file
    // names is kept so the generated file matches the previous output exactly.
    def ctl_text = [
        "seqfile = ${phylip_file} ",
        "outfile = ${phylip_file.simpleName}.yn ",
        'verbose = 0',
        'icode = 0',
        'weighting = 0',
        'commonf3x4 = 0'
    ].join('\n')
    """
    echo "${ctl_text}" > yn00.ctl
    yn00
    """
}
|
||
|
|
||
|
/*
 * EXTRACT_KA_KS
 *
 * Pulls two numbers out of a yn00 result file and writes them, together with
 * the two gene identifiers, as one tab-separated line to the file `csv_row`.
 *
 * input:
 *   - tuple of two values: gene_id_1, gene_id_2
 *   - yn_file: path of the file scanned by awk
 * output:
 *   - path 'csv_row' (gene_id_1, gene_id_2, Ka, Ks separated by tabs)
 */
process EXTRACT_KA_KS {

    input:
    tuple val(gene_id_1), val(gene_id_2)
    path yn_file

    output:
    path 'csv_row'

    script:
    // awk: every line whose first field is "(B)" marks the record 8 lines
    // below it as the target; when that record is reached, fields 8 and 11
    // are printed comma-separated and awk stops. Presumably these are the
    // dN and dS columns of the yn00 "(B)" section; confirm against a sample
    // output file.
    // bash: the comma-separated pair is split into an array, and the two
    // elements are written after the gene identifiers. The backslash-t
    // sequences below are expanded to literal tabs when the script is rendered.
    """
    kaks_csv=\$(awk '
        BEGIN { OFS = "," }
        \$1 == "(B)" { target_record = NR + 8 }
        target_record && NR == target_record { print \$8, \$11; exit }
    ' "${yn_file}")
    fields=(\${kaks_csv//,/ })
    ka=\${fields[0]}
    ks=\${fields[1]}
    echo "${gene_id_1}\t${gene_id_2}\t\${ka}\t\${ks}" > csv_row
    """
}
|
||
|
|
||
|
/*
 * KA_KS
 *
 * Chains the processes of this file:
 *   EXTRACT_TWO_PROTEINS -> CLUSTALW2 -+
 *                                      +-> PAL2NAL -> YN00 -> EXTRACT_KA_KS
 *   EXTRACT_TWO_CDS -------------------+
 *
 * take:
 *   - gene_id_pair:   passed to both extraction processes and EXTRACT_KA_KS
 *   - proteome_fasta: FASTA input of EXTRACT_TWO_PROTEINS
 *   - cds_fasta:      FASTA input of EXTRACT_TWO_CDS
 * emit:
 *   - kaks: output of EXTRACT_KA_KS
 *
 * NOTE(review): PAL2NAL and EXTRACT_KA_KS each receive two independently
 * produced channels and consume them positionally. If gene_id_pair can carry
 * more than one pair, confirm that the emission order of those channels stays
 * in sync; process outputs are not guaranteed to follow input order.
 */
workflow KA_KS {

    take:
    gene_id_pair
    proteome_fasta
    cds_fasta

    main:
    // Per-pair FASTA extraction for both sequence types.
    EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
    EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)

    // Protein alignment, then codon alignment built from it and the CDS file.
    CLUSTALW2(EXTRACT_TWO_PROTEINS.out)
    PAL2NAL(CLUSTALW2.out, EXTRACT_TWO_CDS.out)

    // yn00 run followed by extraction of the two values into one row.
    YN00(PAL2NAL.out)
    EXTRACT_KA_KS(gene_id_pair, YN00.out)

    emit:
    kaks = EXTRACT_KA_KS.out
}
|