Add a workflow to extract Ka/Ks values

Nextflow workflow, but is stuck with memory overflow.
2024-12-24 11:28:30 +01:00 · 2024-12-24 11:28:30 +01:00 · 5322a9686a
parent f766ebcc4a
commit 5322a9686a
8 changed files with 1236 additions and 0 deletions
--- a/notebook.org
+++ b/notebook.org
--- a/workflow/KaKs/conda/clustalw.yaml
+++ b/workflow/KaKs/conda/clustalw.yaml
@ -0,0 +1,6 @@
+name: clustalw
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::clustalw
--- a/workflow/KaKs/conda/pal2nal.yaml
+++ b/workflow/KaKs/conda/pal2nal.yaml
@ -0,0 +1,6 @@
+name: pal2nal
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::pal2nal
--- a/workflow/KaKs/conda/paml.yaml
+++ b/workflow/KaKs/conda/paml.yaml
@ -0,0 +1,6 @@
+name: paml
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - bioconda::paml
--- a/workflow/KaKs/main.nf
+++ b/workflow/KaKs/main.nf
@ -0,0 +1,39 @@
+/** Compute Ka, Ks values for each
+/** pair of duplicate genes in a TAG
+/** and in a gene family
+/** */
+
+include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf"
+include { KA_KS } from "./modules/ka_ks.nf"
+
+process GENE_PAIRS {
+    input:
+    path families
+    output:
+    path 'pairs'
+    script:
+    """
+    cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs'
+    """
+}
+
+workflow {
+    families = params.families
+    cds_gz = params.cds
+    proteome = params.proteome
+
+    cds = GUNZIP_CDS(cds_gz)
+
+    pairs = GENE_PAIRS(families)
+            .splitCsv(sep: '\t', header: false)
+            .map { row -> tuple(row[0], row[1]) }
+
+
+    kaks = KA_KS(pairs, proteome, cds)
+
+    // Save to a CSV with collectFile
+    header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs")
+    header.concat( kaks )
+        .collectFile( name: 'test.txt', newLine: true, sort: false )
+        .view() 
+}
--- a/workflow/KaKs/modules/gunzip.nf
+++ b/workflow/KaKs/modules/gunzip.nf
@ -0,0 +1,12 @@
+process GUNZIP {
+    input:
+    path zipped_file
+
+    output:
+    path zipped_file.baseName
+
+    script:
+    """
+    gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
+    """
+}
--- a/workflow/KaKs/modules/ka_ks.nf
+++ b/workflow/KaKs/modules/ka_ks.nf
@ -0,0 +1,132 @@
+
+process EXTRACT_TWO_PROTEINS {
+    input:
+    tuple(val(gene_id_1), val(gene_id_2))
+    path fasta_file
+    output:
+    path "${gene_id_1}_${gene_id_2}.prot.fst"
+    script:
+    output_file="${gene_id_1}_${gene_id_2}.prot.fst"
+    """
+    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
+     """
+}
+
+process EXTRACT_TWO_CDS {
+    input:
+    tuple(val(gene_id_1), val(gene_id_2))
+    path fasta_file
+    output:
+    path "${gene_id_1}_${gene_id_2}.cds.fst"
+    script:
+    output_file="${gene_id_1}_${gene_id_2}.cds.fst"
+    """
+    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
+    """
+}
+
+
+
+process CLUSTALW2 {
+
+    label 'clustalw'
+
+    input:
+    path protein_sequence
+    output:
+    path "${protein_sequence.simpleName}.prot.ali.aln"
+    script:
+    """
+    clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln"
+    """
+}
+
+
+
+
+process PAL2NAL {
+
+    label 'pal2nal'
+
+    input:
+    path protein_alignment
+    path coding_sequence
+    output:
+    path "${protein_alignment.simpleName}.cds.ali.phy"
+    script:
+    """
+    pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy"
+    """
+}
+
+process YN00 {
+
+    label 'paml'
+
+    input:
+    path phylip_file
+    output:
+    path "${phylip_file.simpleName}.yn"
+    script:
+    """
+    echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
+    yn00
+    """
+}
+
+process EXTRACT_KA_KS {
+
+    input:
+    tuple(val(gene_id_1), val(gene_id_2))
+    path yn_file
+    output:
+    path 'csv_row'
+    script:
+    """
+    KaKs=\$(awk '
+    BEGIN {
+          OFS=","
+          on_good_section=0
+          skip=0
+    }
+    \$1 == "(B)" {
+         skip=8
+         on_good_section=1
+    }
+
+    on_good_section == 1 {
+        if (skip == 0) {
+            Ka=\$8
+            Ks=\$11
+            print Ka, Ks
+            exit
+        } else {
+            skip -= 1
+        }
+    }
+    ' "${yn_file}")
+    arr=(\${KaKs//,/ })
+    Ka=\${arr[0]}
+    Ks=\${arr[1]}
+    echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
+    """
+}
+
+workflow KA_KS {
+
+    take:
+    gene_id_pair
+    proteome_fasta
+    cds_fasta
+
+    main:
+    protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
+    cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
+    protein_alignment = CLUSTALW2(protein_sequences)
+    phylip = PAL2NAL(protein_alignment, cds_sequences)
+    yn = YN00(phylip)
+    kaks = EXTRACT_KA_KS(gene_id_pair, yn)
+
+    emit:
+    kaks = kaks
+}
--- a/workflow/KaKs/scripts/extract_fasta_records.sh
+++ b/workflow/KaKs/scripts/extract_fasta_records.sh
@ -0,0 +1,26 @@
+#!/usr/bin/env sh
+gene_id_1="${1}"
+gene_id_2="${2}"
+fasta_file="${3}"
+output_file="${4}"
+echo "" > "${output_file}"
+for gene_id in ${gene_id_1} ${gene_id_2}; do
+    awk -v gene_id="${gene_id}" '
+        BEGIN {
+            on_gene=0
+        }
+        /^[^>]/ && on_gene == 1 {
+            print $0
+        }
+        /^>/ {
+            gene = $4
+            gsub("gene:", "", gene)
+            if (gene == gene_id) {
+                on_gene=1
+                print $0
+            } else {
+                on_gene=0
+            }
+        }
+        ' "${fasta_file}" >> "${output_file}"
+done