Add a workflow to extract Ka/Ks values
Nextflow workflow, but is stuck with memory overflow.
This commit is contained in:
parent
f766ebcc4a
commit
5322a9686a
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,6 @@
|
|||
name: clustalw
|
||||
channels:
|
||||
- conda-forge
|
||||
- bioconda
|
||||
dependencies:
|
||||
- bioconda::clustalw
|
|
@ -0,0 +1,6 @@
|
|||
name: pal2nal
|
||||
channels:
|
||||
- conda-forge
|
||||
- bioconda
|
||||
dependencies:
|
||||
- bioconda::pal2nal
|
|
@ -0,0 +1,6 @@
|
|||
name: paml
|
||||
channels:
|
||||
- conda-forge
|
||||
- bioconda
|
||||
dependencies:
|
||||
- bioconda::paml
|
|
@ -0,0 +1,39 @@
|
|||
/** Compute Ka, Ks values for each
|
||||
/** pair of duplicate genes in a TAG
|
||||
/** and in a gene family
|
||||
/** */
|
||||
|
||||
include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf"
|
||||
include { KA_KS } from "./modules/ka_ks.nf"
|
||||
|
||||
process GENE_PAIRS {
|
||||
input:
|
||||
path families
|
||||
output:
|
||||
path 'pairs'
|
||||
script:
|
||||
"""
|
||||
cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs'
|
||||
"""
|
||||
}
|
||||
|
||||
workflow {
|
||||
families = params.families
|
||||
cds_gz = params.cds
|
||||
proteome = params.proteome
|
||||
|
||||
cds = GUNZIP_CDS(cds_gz)
|
||||
|
||||
pairs = GENE_PAIRS(families)
|
||||
.splitCsv(sep: '\t', header: false)
|
||||
.map { row -> tuple(row[0], row[1]) }
|
||||
|
||||
|
||||
kaks = KA_KS(pairs, proteome, cds)
|
||||
|
||||
// Save to a CSV with collectFile
|
||||
header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs")
|
||||
header.concat( kaks )
|
||||
.collectFile( name: 'test.txt', newLine: true, sort: false )
|
||||
.view()
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
process GUNZIP {
|
||||
input:
|
||||
path zipped_file
|
||||
|
||||
output:
|
||||
path zipped_file.baseName
|
||||
|
||||
script:
|
||||
"""
|
||||
gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
|
||||
"""
|
||||
}
|
|
@ -0,0 +1,132 @@
|
|||
|
||||
process EXTRACT_TWO_PROTEINS {
|
||||
input:
|
||||
tuple(val(gene_id_1), val(gene_id_2))
|
||||
path fasta_file
|
||||
output:
|
||||
path "${gene_id_1}_${gene_id_2}.prot.fst"
|
||||
script:
|
||||
output_file="${gene_id_1}_${gene_id_2}.prot.fst"
|
||||
"""
|
||||
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
|
||||
"""
|
||||
}
|
||||
|
||||
process EXTRACT_TWO_CDS {
|
||||
input:
|
||||
tuple(val(gene_id_1), val(gene_id_2))
|
||||
path fasta_file
|
||||
output:
|
||||
path "${gene_id_1}_${gene_id_2}.cds.fst"
|
||||
script:
|
||||
output_file="${gene_id_1}_${gene_id_2}.cds.fst"
|
||||
"""
|
||||
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
|
||||
"""
|
||||
}
|
||||
|
||||
|
||||
|
||||
process CLUSTALW2 {
|
||||
|
||||
label 'clustalw'
|
||||
|
||||
input:
|
||||
path protein_sequence
|
||||
output:
|
||||
path "${protein_sequence.simpleName}.prot.ali.aln"
|
||||
script:
|
||||
"""
|
||||
clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln"
|
||||
"""
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
process PAL2NAL {
|
||||
|
||||
label 'pal2nal'
|
||||
|
||||
input:
|
||||
path protein_alignment
|
||||
path coding_sequence
|
||||
output:
|
||||
path "${protein_alignment.simpleName}.cds.ali.phy"
|
||||
script:
|
||||
"""
|
||||
pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy"
|
||||
"""
|
||||
}
|
||||
|
||||
process YN00 {
|
||||
|
||||
label 'paml'
|
||||
|
||||
input:
|
||||
path phylip_file
|
||||
output:
|
||||
path "${phylip_file.simpleName}.yn"
|
||||
script:
|
||||
"""
|
||||
echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
|
||||
yn00
|
||||
"""
|
||||
}
|
||||
|
||||
process EXTRACT_KA_KS {
|
||||
|
||||
input:
|
||||
tuple(val(gene_id_1), val(gene_id_2))
|
||||
path yn_file
|
||||
output:
|
||||
path 'csv_row'
|
||||
script:
|
||||
"""
|
||||
KaKs=\$(awk '
|
||||
BEGIN {
|
||||
OFS=","
|
||||
on_good_section=0
|
||||
skip=0
|
||||
}
|
||||
\$1 == "(B)" {
|
||||
skip=8
|
||||
on_good_section=1
|
||||
}
|
||||
|
||||
on_good_section == 1 {
|
||||
if (skip == 0) {
|
||||
Ka=\$8
|
||||
Ks=\$11
|
||||
print Ka, Ks
|
||||
exit
|
||||
} else {
|
||||
skip -= 1
|
||||
}
|
||||
}
|
||||
' "${yn_file}")
|
||||
arr=(\${KaKs//,/ })
|
||||
Ka=\${arr[0]}
|
||||
Ks=\${arr[1]}
|
||||
echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
|
||||
"""
|
||||
}
|
||||
|
||||
workflow KA_KS {
|
||||
|
||||
take:
|
||||
gene_id_pair
|
||||
proteome_fasta
|
||||
cds_fasta
|
||||
|
||||
main:
|
||||
protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
|
||||
cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
|
||||
protein_alignment = CLUSTALW2(protein_sequences)
|
||||
phylip = PAL2NAL(protein_alignment, cds_sequences)
|
||||
yn = YN00(phylip)
|
||||
kaks = EXTRACT_KA_KS(gene_id_pair, yn)
|
||||
|
||||
emit:
|
||||
kaks = kaks
|
||||
}
|
|
@ -0,0 +1,26 @@
|
|||
#!/usr/bin/env sh
|
||||
gene_id_1="${1}"
|
||||
gene_id_2="${2}"
|
||||
fasta_file="${3}"
|
||||
output_file="${4}"
|
||||
echo "" > "${output_file}"
|
||||
for gene_id in ${gene_id_1} ${gene_id_2}; do
|
||||
awk -v gene_id="${gene_id}" '
|
||||
BEGIN {
|
||||
on_gene=0
|
||||
}
|
||||
/^[^>]/ && on_gene == 1 {
|
||||
print $0
|
||||
}
|
||||
/^>/ {
|
||||
gene = $4
|
||||
gsub("gene:", "", gene)
|
||||
if (gene == gene_id) {
|
||||
on_gene=1
|
||||
print $0
|
||||
} else {
|
||||
on_gene=0
|
||||
}
|
||||
}
|
||||
' "${fasta_file}" >> "${output_file}"
|
||||
done
|
Loading…
Reference in New Issue