/*
 * Ka/Ks sub-workflow: for each pair of gene IDs, extract the two protein and
 * the two CDS records, align the proteins (clustalw2), thread the CDS onto the
 * protein alignment (pal2nal), run PAML yn00 and pull Ka and Ks out of its
 * report.
 */

/*
 * Return the name of `file` with `suffix` removed.
 *
 * The pair prefix ("<gene_id_1>_<gene_id_2>") is carried from step to step
 * only through file names. `simpleName` cuts at the FIRST dot, so it would
 * truncate gene IDs that themselves contain dots (e.g. "AT1G01010.1").
 * Stripping the known suffix keeps the whole prefix. When the file does not
 * end with `suffix` (a caller feeding some other file), fall back to
 * `simpleName`, which is what the processes used before.
 */
def pairPrefix(file, String suffix) {
    String name = file.name
    return name.endsWith(suffix)
        ? name.substring(0, name.length() - suffix.length())
        : file.simpleName
}

/*
 * Extract the protein records of two genes from a FASTA file.
 *
 * Inputs : (gene_id_1, gene_id_2) tuple; the FASTA file to read from.
 * Output : "<gene_id_1>_<gene_id_2>.prot.fst".
 * The extraction itself is delegated to scripts/extract_fasta_records.sh
 * under the pipeline base directory.
 */
process EXTRACT_TWO_PROTEINS {
    input:
    tuple(val(gene_id_1), val(gene_id_2))
    path fasta_file

    output:
    path "${gene_id_1}_${gene_id_2}.prot.fst"

    script:
    output_file = "${gene_id_1}_${gene_id_2}.prot.fst"
    """
    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
    """
}

/*
 * Extract the CDS records of two genes from a FASTA file.
 *
 * Same contract as EXTRACT_TWO_PROTEINS, but the output is named
 * "<gene_id_1>_<gene_id_2>.cds.fst".
 */
process EXTRACT_TWO_CDS {
    input:
    tuple(val(gene_id_1), val(gene_id_2))
    path fasta_file

    output:
    path "${gene_id_1}_${gene_id_2}.cds.fst"

    script:
    output_file = "${gene_id_1}_${gene_id_2}.cds.fst"
    """
    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
    """
}

/*
 * Align the protein sequences of one pair with clustalw2.
 *
 * Input  : protein FASTA (normally "<prefix>.prot.fst").
 * Output : "<prefix>.prot.ali.aln".
 */
process CLUSTALW2 {
    label 'clustalw'

    input:
    path protein_sequence

    output:
    path "${prefix}.prot.ali.aln"

    script:
    // Deliberately not declared with `def`: the output declaration above
    // reads `prefix` from the task context after the script is resolved.
    prefix = pairPrefix(protein_sequence, '.prot.fst')
    """
    clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${prefix}.prot.ali.aln"
    """
}

/*
 * Convert a protein alignment plus the matching CDS into a codon alignment
 * in PAML format with pal2nal.pl.
 *
 * Inputs : protein alignment (normally "<prefix>.prot.ali.aln"); CDS FASTA.
 *          Both must belong to the same gene pair - the workflow below is
 *          responsible for pairing them.
 * Output : "<prefix>.cds.ali.phy".
 */
process PAL2NAL {
    label 'pal2nal'

    input:
    path protein_alignment
    path coding_sequence

    output:
    path "${prefix}.cds.ali.phy"

    script:
    // Not `def`: read by the output declaration (see CLUSTALW2).
    prefix = pairPrefix(protein_alignment, '.prot.ali.aln')
    """
    pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${prefix}.cds.ali.phy"
    """
}

/*
 * Run PAML yn00 on a codon alignment.
 *
 * Input  : codon alignment (normally "<prefix>.cds.ali.phy").
 * Output : "<prefix>.yn", the yn00 report.
 * A yn00.ctl control file is written in the task directory first; the `\n`
 * escapes are expanded by Groovy, so bash sees a multi-line quoted string.
 */
process YN00 {
    label 'paml'

    input:
    path phylip_file

    output:
    path "${prefix}.yn"

    script:
    // Not `def`: read by the output declaration (see CLUSTALW2).
    prefix = pairPrefix(phylip_file, '.cds.ali.phy')
    """
    echo "seqfile = ${phylip_file} \noutfile = ${prefix}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
    yn00
    """
}

/*
 * Pull Ka and Ks for one gene pair out of a yn00 report.
 *
 * Inputs : (gene_id_1, gene_id_2) tuple; the yn00 report of that pair.
 * Output : file `csv_row` holding one line:
 *          gene_id_1 <TAB> gene_id_2 <TAB> Ka <TAB> Ks
 *          (despite the name the separator is a tab - `\t` is expanded by
 *          Groovy before bash sees the script).
 *
 * The awk program looks for the first line whose first field is "(B)", then
 * reads fields 8 and 11 of the line located 8 lines further down.
 * NOTE(review): this presumably targets the pairwise result row of the
 * "(B) Yang & Nielsen (2000) method" section of yn00 output - confirm the
 * offset against the PAML version in use. If no "(B)" line is found, Ka and
 * Ks are written as empty fields.
 */
process EXTRACT_KA_KS {
    input:
    tuple(val(gene_id_1), val(gene_id_2))
    path yn_file

    output:
    path 'csv_row'

    script:
    """
    KaKs=\$(awk '
        BEGIN {
            OFS=","
            on_good_section=0
            skip=0
        }
        \$1 == "(B)" {
            skip=8
            on_good_section=1
        }
        on_good_section == 1 {
            if (skip == 0) {
                Ka=\$8
                Ks=\$11
                print Ka, Ks
                exit
            } else {
                skip -= 1
            }
        }
    ' "${yn_file}")
    arr=(\${KaKs//,/ })
    Ka=\${arr[0]}
    Ks=\${arr[1]}
    echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
    """
}

/*
 * Compute Ka/Ks for gene pairs.
 *
 * take:
 *   gene_id_pair   - channel of (gene_id_1, gene_id_2) tuples
 *   proteome_fasta - protein FASTA passed to EXTRACT_TWO_PROTEINS
 *   cds_fasta      - CDS FASTA passed to EXTRACT_TWO_CDS
 *   NOTE(review): the two FASTA inputs presumably need to be value channels
 *   (or plain files) so they are reused for every pair - verify at call site.
 * emit:
 *   kaks - one `csv_row` file per gene pair
 *
 * Process outputs are emitted in task-completion order, not submission
 * order, so two queue channels must never be handed to a process side by
 * side on the assumption that their n-th items belong together. Wherever a
 * process needs two per-pair inputs, both sides are re-keyed by the pair
 * prefix recovered from the file name, joined on that key, and split back
 * into two order-synchronised channels with multiMap.
 * Limitation: two different pairs whose "<id_1>_<id_2>" strings coincide
 * (e.g. ("a_b","c") and ("a","b_c")) cannot be told apart.
 */
workflow KA_KS {
    take:
    gene_id_pair
    proteome_fasta
    cds_fasta

    main:
    protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
    cds_sequences     = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
    protein_alignment = CLUSTALW2(protein_sequences)

    // Pair each alignment with the CDS file of the same gene pair.
    keyed_cds = cds_sequences
        .map { cds_file -> tuple(pairPrefix(cds_file, '.cds.fst'), cds_file) }
    pal2nal_in = protein_alignment
        .map { aln_file -> tuple(pairPrefix(aln_file, '.prot.ali.aln'), aln_file) }
        .join(keyed_cds)
        .multiMap { key, aln_file, cds_file ->
            alignment: aln_file
            coding: cds_file
        }

    phylip = PAL2NAL(pal2nal_in.alignment, pal2nal_in.coding)
    yn     = YN00(phylip)

    // Pair each yn00 report with the gene IDs it was computed for.
    // toString(): a GString key would never equal the String key built from
    // the file name, and the join would silently match nothing.
    keyed_yn = yn
        .map { report -> tuple(pairPrefix(report, '.yn'), report) }
    kaks_in = gene_id_pair
        .map { id_1, id_2 -> tuple("${id_1}_${id_2}".toString(), id_1, id_2) }
        .join(keyed_yn)
        .multiMap { key, id_1, id_2, report ->
            ids: tuple(id_1, id_2)
            report: report
        }

    kaks = EXTRACT_KA_KS(kaks_in.ids, kaks_in.report)

    emit:
    kaks = kaks
}