Add a workflow to extract Ka/Ks values

Nextflow workflow, but is stuck with memory overflow.
This commit is contained in:
Samuel Ortion 2024-12-24 11:28:30 +01:00
parent f766ebcc4a
commit 5322a9686a
Signed by: sortion
GPG Key ID: 9B02406F8C4FB765
8 changed files with 1236 additions and 0 deletions

1009
notebook.org Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,6 @@
name: clustalw
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::clustalw

View File

@ -0,0 +1,6 @@
name: pal2nal
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::pal2nal

View File

@ -0,0 +1,6 @@
name: paml
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::paml

39
workflow/KaKs/main.nf Normal file
View File

@ -0,0 +1,39 @@
/** Compute Ka, Ks values for each
/** pair of duplicate genes in a TAG
/** and in a gene family
/** */
include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf"
include { KA_KS } from "./modules/ka_ks.nf"
process GENE_PAIRS {
input:
path families
output:
path 'pairs'
script:
"""
cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs'
"""
}
workflow {
families = params.families
cds_gz = params.cds
proteome = params.proteome
cds = GUNZIP_CDS(cds_gz)
pairs = GENE_PAIRS(families)
.splitCsv(sep: '\t', header: false)
.map { row -> tuple(row[0], row[1]) }
kaks = KA_KS(pairs, proteome, cds)
// Save to a CSV with collectFile
header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs")
header.concat( kaks )
.collectFile( name: 'test.txt', newLine: true, sort: false )
.view()
}

View File

@ -0,0 +1,12 @@
process GUNZIP {
input:
path zipped_file
output:
path zipped_file.baseName
script:
"""
gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
"""
}

View File

@ -0,0 +1,132 @@
process EXTRACT_TWO_PROTEINS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path fasta_file
output:
path "${gene_id_1}_${gene_id_2}.prot.fst"
script:
output_file="${gene_id_1}_${gene_id_2}.prot.fst"
"""
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
"""
}
process EXTRACT_TWO_CDS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path fasta_file
output:
path "${gene_id_1}_${gene_id_2}.cds.fst"
script:
output_file="${gene_id_1}_${gene_id_2}.cds.fst"
"""
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
"""
}
process CLUSTALW2 {
label 'clustalw'
input:
path protein_sequence
output:
path "${protein_sequence.simpleName}.prot.ali.aln"
script:
"""
clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln"
"""
}
process PAL2NAL {
label 'pal2nal'
input:
path protein_alignment
path coding_sequence
output:
path "${protein_alignment.simpleName}.cds.ali.phy"
script:
"""
pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy"
"""
}
process YN00 {
label 'paml'
input:
path phylip_file
output:
path "${phylip_file.simpleName}.yn"
script:
"""
echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
yn00
"""
}
process EXTRACT_KA_KS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path yn_file
output:
path 'csv_row'
script:
"""
KaKs=\$(awk '
BEGIN {
OFS=","
on_good_section=0
skip=0
}
\$1 == "(B)" {
skip=8
on_good_section=1
}
on_good_section == 1 {
if (skip == 0) {
Ka=\$8
Ks=\$11
print Ka, Ks
exit
} else {
skip -= 1
}
}
' "${yn_file}")
arr=(\${KaKs//,/ })
Ka=\${arr[0]}
Ks=\${arr[1]}
echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
"""
}
workflow KA_KS {
take:
gene_id_pair
proteome_fasta
cds_fasta
main:
protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
protein_alignment = CLUSTALW2(protein_sequences)
phylip = PAL2NAL(protein_alignment, cds_sequences)
yn = YN00(phylip)
kaks = EXTRACT_KA_KS(gene_id_pair, yn)
emit:
kaks = kaks
}

View File

@ -0,0 +1,26 @@
#!/usr/bin/env sh
gene_id_1="${1}"
gene_id_2="${2}"
fasta_file="${3}"
output_file="${4}"
echo "" > "${output_file}"
for gene_id in ${gene_id_1} ${gene_id_2}; do
awk -v gene_id="${gene_id}" '
BEGIN {
on_gene=0
}
/^[^>]/ && on_gene == 1 {
print $0
}
/^>/ {
gene = $4
gsub("gene:", "", gene)
if (gene == gene_id) {
on_gene=1
print $0
} else {
on_gene=0
}
}
' "${fasta_file}" >> "${output_file}"
done