Initial Nextflow workflow

This commit is contained in:
Samuel Ortion 2024-10-28 10:46:05 +01:00
commit 0540e3699e
Signed by: sortion
GPG Key ID: 9B02406F8C4FB765
16 changed files with 412 additions and 0 deletions

5
workflow/conda/blast.yml Normal file
View File

@ -0,0 +1,5 @@
name: blast
channels:
- bioconda
dependencies:
- blast=2.5

5
workflow/conda/mcl.yml Normal file
View File

@ -0,0 +1,5 @@
name: mcl
channels:
- bioconda
dependencies:
- bioconda::mcl=22.282

25
workflow/main.nf Normal file
View File

@ -0,0 +1,25 @@
/**
/** Comparative Genomics workflow
/**
/** This workflow find the duplicate genes from a proteome
/** Then, It finds the Tandemly Arrayed Genes (TAGs)
/**/
nextflow.enable.dsl = 2;
include { GUNZIP } from "./modules/gunzip.nf"
include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
include { BLAST_BLASTP } from "./modules/blast.nf"
include { FILTER_FASTA } from "./modules/filter_fasta.nf"
include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
include { CLUSTERING } from "./modules/clustering.nf"
workflow {
proteome = Channel.fromPath(params.proteome)
GUNZIP(proteome)
FILTER_FASTA(GUNZIP.out)
BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
CLUSTERING(FILTER_BLASTP.out)
}

39
workflow/modules/blast.nf Normal file
View File

@ -0,0 +1,39 @@
/** blastp all against all */
process BLAST_MAKEBLASTDB {
label 'blast'
input:
val species
path proteome
output:
path 'db.phr'
path 'db.pin'
path 'db.psq'
script:
"""
makeblastdb -in "${proteome}" -dbtype prot -out 'db' -title "${species}"
"""
}
process BLAST_BLASTP {
label 'blast'
input:
val species
path proteome
path 'db.phr'
path 'db.pin'
path 'db.psq'
output:
path "${species}.all-against-all.blastp.tsv"
script:
"""
blastp -query "${proteome}" -db 'db' -outfmt '6' -out "${species}.all-against-all.blastp.tsv"
"""
}

View File

@ -0,0 +1,54 @@
process BLASTP_TO_ABC {
input:
path blastp
output:
path 'graph.abc'
script:
"""
awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
"""
}
process MCL {
input:
path abc
output:
path 'clustering.mcl'
script:
"""
mcl "${abc}" --abc -o 'custering.mcl'
"""
}
process MCL_TO_TSV {
input:
path mcl
output:
path 'families.tsv'
script:
"""
awk -f "${baseDir}/scripts/mcl_to_tsv.awk" "${mcl}" > 'families.tsv'
"""
}
workflow CLUSTERING {
take:
blastp_tsv
main:
BLASTP_TO_ABC(blastp_tsv)
MCL(BLASTP_TO_ABC).out
MCL_TO_TSV(MCL.out)
emit:
families = MCL_TO_TSV.out
}

View File

View File

@ -0,0 +1,25 @@
/** Filter blastp output based on coverage and identity percentage
/**/
process FILTER_BLASTP {
input:
val min_coverage
val min_identity
path blastp
path protein_length
output:
path 'filtered_blastp.tsv'
script:
"""
sort "${blastp}" > blastp_s
sort "${protein_length}" > protein_length_s
join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
sort -k 2 "join1.tsv" > join1.tsv_s
join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
"${blastp}" > 'filtered_blastp.tsv'
"""
}

View File

@ -0,0 +1,57 @@
/** 1. Filter out proteome sequence belonging to 'supercontig' */
/** 2. Filter proteome to keep only the longest isoform */
process FILTER_SUPERCONTIG {
input:
path proteome
output:
path 'proteome_nosupercontig.fasta'
script:
"""
awk -f "${baseDir}/scripts/remove_supercontigs.awk" "${proteome}" > 'proteome_nosupercontig.fasta'
"""
}
process SEQUENCE_LENGTH {
input:
path proteome
output:
path 'protein_lengths.tsv'
script:
"""
awk -f "${baseDir}/scripts/protein_lengths.awk" "${proteome}" > 'protein_lengths.tsv'
"""
}
process PROTEOME_LONGEST {
input:
path proteome
path protein_lengths
output:
path 'proteome_filtered.fa'
script:
"""
awk -f "${baseDir}/scripts/filter_longest.awk" "${protein_lengths}" "${proteome}" > 'records_filtered.list'
awk -f "${baseDir}/scripts/filter_records_fasta.awk" 'records_filtered.list' "${proteome}" > 'proteome_filtered.fa'
"""
}
workflow FILTER_FASTA {
take:
proteome
main:
FILTER_SUPERCONTIG(proteome)
SEQUENCE_LENGTH(FILTER_SUPERCONTIG.out)
PROTEOME_LONGEST(FILTER_SUPERCONTIG.out, SEQUENCE_LENGTH.out)
emit:
lengths = SEQUENCE_LENGTH.out
proteome = PROTEOME_LONGEST.out
}

View File

@ -0,0 +1,12 @@
process GUNZIP {
input:
path zipped_file
output:
path zipped_file.baseName
script:
"""
gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
"""
}

21
workflow/nextflow.config Normal file
View File

@ -0,0 +1,21 @@
params {
proteome = "${baseDir}/../../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
species = "Glycine_max"
results = "results"
}
profiles {
conda {
conda.enabled = true
process {
withLabel: blast {
conda = "$baseDir/conda/blast.yml"
}
withLabel: mcl {
conda = "$baseDir/conda/mcl.yml"
}
}
}
}

View File

@ -0,0 +1,35 @@
#!/usr/bin/env -S awk -f
# Usage: $0 -v min_identity=30 \
# -v min_coverage=75 \
# ./file.blastp.tsv
# The blastp.tsv file should have
# the four following supplementary colums
# query length,
# query gene ID,
# subject length,
# subject gene ID
BEGIN {
OFS="\t"
}
function get_coverage(sequence_length, sequence_start, sequence_end) {
return (sequence_end - sequence_start + 1) / sequence_length * 100
}
{
qgeneid=$14
sgeneid=$16
qstart=$7
qend=$8
sstart=$9
send=$10
qlength=$13
slength=$15
identity=$3
evalue=$11
bitscore=$12
if (get_coverage(qlength, qstart, qend) > min_coverage && get_coverage(slength, sstart, send) > min_coverage && identity > min_identity) {
print $0
}
}

View File

@ -0,0 +1,34 @@
#!/usr/bin/env -S awk -f
# Extract the FASTA record id with the longest sequence.
# Takes a two columns file with record id, record length
# and the FASTA file.
#
# Usage: filter_longest.awk protein_lengths.tsv proteins.fasta
NR==FNR {
gsub(">", "", $1)
protein_length[$1] = $2
next
}
{
gene_id = $4
gsub("gene:", "", gene_id)
isoform_id = $0
gsub(">", "", isoform_id)
if (gene_id in max_length) {
if (protein_length[isoform_id] > max_length[gene_id]) {
max_length[gene_id] = protein_length[isoform_id]
max_length_isoform[gene_id] = isoform_id
}
} else {
max_length[gene_id] = protein_length[isoform_id]
max_length_isoform[gene_id] = isoform_id
}
}
END {
for (gene_id in max_length_isoform) {
print max_length_isoform[gene_id]
}
}

View File

@ -0,0 +1,32 @@
#!/usr/bin/env -S awk -f
# Filter FASTA file
# to keep only the records
# with ID in records.list
#
# Usage:
# awk -f filter_record_fasta.awk \
# records.list records.fasta
BEGIN {
on_good_record = 0
}
NR == FNR {
sequence_id = $1
kept[sequence_id] = 1
next
}
/^>/ {
on_good_record = 0
sequence_id = $1
gsub(">", "", sequence_id)
if (sequence_id in kept) {
print $0
on_good_record = 1
}
}
/^[^>]/ && on_good_record {
print $0
}

View File

@ -0,0 +1,16 @@
#!/usr/bin/env -S awk -f
# Convert mcl output into tsv output
# with two columns 'gene id, family identifier'
BEGIN {
family_identifier=0
OFS="\t"
}
{
family_identifier++
split($0, gene_list, " ")
for (gene in gene_list) {
print gene, family_identifier
}
}

View File

@ -0,0 +1,27 @@
#!/usr/bin/env -S awk -f
# Associate a isoform id to the length of the sequence
BEGIN {
sequence_length=0
isoform_id=""
}
/^>/ {
if (isoform_id != "") {
print isoform_id, sequence_length
} else {
isoform_id = $1
gsub(">", "", isoform_id)
sequence_length = 0
}
}
/^[^>]/ {
sequence_length += length($0)
}
END {
if (isoform_id != "") {
print isoform_id, sequence_length
}
}

View File

@ -0,0 +1,25 @@
#!/usr/bin/env -S awk -f
# Filter out 'supercontigs' records
# from an Ensembl proteome fasta file
# Usage:
# awk -f remove_supercontigs.awk \
# "proteome.fasta" \
# > "proteome_nosupercontigs.fasta"
BEGIN {
on_good_sequence = 0
}
/^>/ {
on_good_sequence = 0
locus = $3
split(locus, locus_array, ":")
if (locus_array[1] != "supercontig") {
on_good_sequence = 1
print $0
}
}
/^[^>]/ && on_good_sequence {
print $0
}