Initial Nextflow workflow
This commit is contained in:
commit
0540e3699e
|
@ -0,0 +1,5 @@
|
|||
name: blast
|
||||
channels:
|
||||
- bioconda
|
||||
dependencies:
|
||||
- blast=2.5
|
|
@ -0,0 +1,5 @@
|
|||
name: mcl
|
||||
channels:
|
||||
- bioconda
|
||||
dependencies:
|
||||
- bioconda::mcl=22.282
|
|
@ -0,0 +1,25 @@
|
|||
/**
|
||||
/** Comparative Genomics workflow
|
||||
/**
|
||||
/** This workflow find the duplicate genes from a proteome
|
||||
/** Then, It finds the Tandemly Arrayed Genes (TAGs)
|
||||
/**/
|
||||
|
||||
nextflow.enable.dsl = 2;
|
||||
|
||||
include { GUNZIP } from "./modules/gunzip.nf"
|
||||
include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
|
||||
include { BLAST_BLASTP } from "./modules/blast.nf"
|
||||
include { FILTER_FASTA } from "./modules/filter_fasta.nf"
|
||||
include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
|
||||
include { CLUSTERING } from "./modules/clustering.nf"
|
||||
|
||||
workflow {
|
||||
proteome = Channel.fromPath(params.proteome)
|
||||
GUNZIP(proteome)
|
||||
FILTER_FASTA(GUNZIP.out)
|
||||
BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
|
||||
BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
|
||||
FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
|
||||
CLUSTERING(FILTER_BLASTP.out)
|
||||
}
|
|
@ -0,0 +1,39 @@
|
|||
/** blastp all against all */
|
||||
|
||||
process BLAST_MAKEBLASTDB {
|
||||
|
||||
label 'blast'
|
||||
|
||||
input:
|
||||
val species
|
||||
path proteome
|
||||
|
||||
output:
|
||||
path 'db.phr'
|
||||
path 'db.pin'
|
||||
path 'db.psq'
|
||||
|
||||
script:
|
||||
"""
|
||||
makeblastdb -in "${proteome}" -dbtype prot -out 'db' -title "${species}"
|
||||
"""
|
||||
}
|
||||
|
||||
process BLAST_BLASTP {
|
||||
|
||||
label 'blast'
|
||||
|
||||
input:
|
||||
val species
|
||||
path proteome
|
||||
path 'db.phr'
|
||||
path 'db.pin'
|
||||
path 'db.psq'
|
||||
output:
|
||||
path "${species}.all-against-all.blastp.tsv"
|
||||
|
||||
script:
|
||||
"""
|
||||
blastp -query "${proteome}" -db 'db' -outfmt '6' -out "${species}.all-against-all.blastp.tsv"
|
||||
"""
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
process BLASTP_TO_ABC {
|
||||
|
||||
input:
|
||||
path blastp
|
||||
|
||||
output:
|
||||
path 'graph.abc'
|
||||
|
||||
script:
|
||||
"""
|
||||
awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
|
||||
"""
|
||||
}
|
||||
|
||||
process MCL {
|
||||
|
||||
input:
|
||||
path abc
|
||||
|
||||
output:
|
||||
path 'clustering.mcl'
|
||||
|
||||
script:
|
||||
"""
|
||||
mcl "${abc}" --abc -o 'custering.mcl'
|
||||
"""
|
||||
}
|
||||
|
||||
process MCL_TO_TSV {
|
||||
|
||||
input:
|
||||
path mcl
|
||||
|
||||
output:
|
||||
path 'families.tsv'
|
||||
|
||||
script:
|
||||
"""
|
||||
awk -f "${baseDir}/scripts/mcl_to_tsv.awk" "${mcl}" > 'families.tsv'
|
||||
"""
|
||||
}
|
||||
|
||||
workflow CLUSTERING {
|
||||
take:
|
||||
blastp_tsv
|
||||
|
||||
main:
|
||||
BLASTP_TO_ABC(blastp_tsv)
|
||||
MCL(BLASTP_TO_ABC).out
|
||||
MCL_TO_TSV(MCL.out)
|
||||
|
||||
emit:
|
||||
families = MCL_TO_TSV.out
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
/** Filter blastp output based on coverage and identity percentage
|
||||
/**/
|
||||
|
||||
process FILTER_BLASTP {
|
||||
|
||||
input:
|
||||
val min_coverage
|
||||
val min_identity
|
||||
path blastp
|
||||
path protein_length
|
||||
|
||||
output:
|
||||
path 'filtered_blastp.tsv'
|
||||
|
||||
script:
|
||||
"""
|
||||
sort "${blastp}" > blastp_s
|
||||
sort "${protein_length}" > protein_length_s
|
||||
join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
|
||||
sort -k 2 "join1.tsv" > join1.tsv_s
|
||||
join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
|
||||
awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
|
||||
"${blastp}" > 'filtered_blastp.tsv'
|
||||
"""
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
/** 1. Filter out proteome sequence belonging to 'supercontig' */
|
||||
/** 2. Filter proteome to keep only the longest isoform */
|
||||
|
||||
process FILTER_SUPERCONTIG {
|
||||
input:
|
||||
path proteome
|
||||
|
||||
output:
|
||||
path 'proteome_nosupercontig.fasta'
|
||||
|
||||
script:
|
||||
"""
|
||||
awk -f "${baseDir}/scripts/remove_supercontigs.awk" "${proteome}" > 'proteome_nosupercontig.fasta'
|
||||
"""
|
||||
}
|
||||
|
||||
process SEQUENCE_LENGTH {
|
||||
input:
|
||||
path proteome
|
||||
|
||||
output:
|
||||
path 'protein_lengths.tsv'
|
||||
|
||||
script:
|
||||
"""
|
||||
awk -f "${baseDir}/scripts/protein_lengths.awk" "${proteome}" > 'protein_lengths.tsv'
|
||||
"""
|
||||
}
|
||||
|
||||
process PROTEOME_LONGEST {
|
||||
input:
|
||||
path proteome
|
||||
path protein_lengths
|
||||
|
||||
output:
|
||||
path 'proteome_filtered.fa'
|
||||
|
||||
script:
|
||||
"""
|
||||
awk -f "${baseDir}/scripts/filter_longest.awk" "${protein_lengths}" "${proteome}" > 'records_filtered.list'
|
||||
awk -f "${baseDir}/scripts/filter_records_fasta.awk" 'records_filtered.list' "${proteome}" > 'proteome_filtered.fa'
|
||||
"""
|
||||
}
|
||||
|
||||
workflow FILTER_FASTA {
|
||||
take:
|
||||
proteome
|
||||
|
||||
main:
|
||||
FILTER_SUPERCONTIG(proteome)
|
||||
SEQUENCE_LENGTH(FILTER_SUPERCONTIG.out)
|
||||
PROTEOME_LONGEST(FILTER_SUPERCONTIG.out, SEQUENCE_LENGTH.out)
|
||||
|
||||
emit:
|
||||
lengths = SEQUENCE_LENGTH.out
|
||||
proteome = PROTEOME_LONGEST.out
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
process GUNZIP {
|
||||
input:
|
||||
path zipped_file
|
||||
|
||||
output:
|
||||
path zipped_file.baseName
|
||||
|
||||
script:
|
||||
"""
|
||||
gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
|
||||
"""
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
params {
|
||||
proteome = "${baseDir}/../../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
|
||||
species = "Glycine_max"
|
||||
results = "results"
|
||||
}
|
||||
|
||||
profiles {
|
||||
|
||||
conda {
|
||||
conda.enabled = true
|
||||
|
||||
process {
|
||||
withLabel: blast {
|
||||
conda = "$baseDir/conda/blast.yml"
|
||||
}
|
||||
withLabel: mcl {
|
||||
conda = "$baseDir/conda/mcl.yml"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
#!/usr/bin/env -S awk -f
|
||||
# Usage: $0 -v min_identity=30 \
|
||||
# -v min_coverage=75 \
|
||||
# ./file.blastp.tsv
|
||||
# The blastp.tsv file should have
|
||||
# the four following supplementary colums
|
||||
# query length,
|
||||
# query gene ID,
|
||||
# subject length,
|
||||
# subject gene ID
|
||||
|
||||
BEGIN {
|
||||
OFS="\t"
|
||||
}
|
||||
|
||||
function get_coverage(sequence_length, sequence_start, sequence_end) {
|
||||
return (sequence_end - sequence_start + 1) / sequence_length * 100
|
||||
}
|
||||
|
||||
{
|
||||
qgeneid=$14
|
||||
sgeneid=$16
|
||||
qstart=$7
|
||||
qend=$8
|
||||
sstart=$9
|
||||
send=$10
|
||||
qlength=$13
|
||||
slength=$15
|
||||
identity=$3
|
||||
evalue=$11
|
||||
bitscore=$12
|
||||
if (get_coverage(qlength, qstart, qend) > min_coverage && get_coverage(slength, sstart, send) > min_coverage && identity > min_identity) {
|
||||
print $0
|
||||
}
|
||||
}
|
|
@ -0,0 +1,34 @@
|
|||
#!/usr/bin/env -S awk -f
|
||||
# Extract the FASTA record id with the longest sequence.
|
||||
# Takes a two columns file with record id, record length
|
||||
# and the FASTA file.
|
||||
#
|
||||
# Usage: filter_longest.awk protein_lengths.tsv proteins.fasta
|
||||
|
||||
NR==FNR {
|
||||
gsub(">", "", $1)
|
||||
protein_length[$1] = $2
|
||||
next
|
||||
}
|
||||
|
||||
{
|
||||
gene_id = $4
|
||||
gsub("gene:", "", gene_id)
|
||||
isoform_id = $0
|
||||
gsub(">", "", isoform_id)
|
||||
if (gene_id in max_length) {
|
||||
if (protein_length[isoform_id] > max_length[gene_id]) {
|
||||
max_length[gene_id] = protein_length[isoform_id]
|
||||
max_length_isoform[gene_id] = isoform_id
|
||||
}
|
||||
} else {
|
||||
max_length[gene_id] = protein_length[isoform_id]
|
||||
max_length_isoform[gene_id] = isoform_id
|
||||
}
|
||||
}
|
||||
|
||||
END {
|
||||
for (gene_id in max_length_isoform) {
|
||||
print max_length_isoform[gene_id]
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env -S awk -f
|
||||
# Filter FASTA file
|
||||
# to keep only the records
|
||||
# with ID in records.list
|
||||
#
|
||||
# Usage:
|
||||
# awk -f filter_record_fasta.awk \
|
||||
# records.list records.fasta
|
||||
|
||||
BEGIN {
|
||||
on_good_record = 0
|
||||
}
|
||||
|
||||
NR == FNR {
|
||||
sequence_id = $1
|
||||
kept[sequence_id] = 1
|
||||
next
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
on_good_record = 0
|
||||
sequence_id = $1
|
||||
gsub(">", "", sequence_id)
|
||||
if (sequence_id in kept) {
|
||||
print $0
|
||||
on_good_record = 1
|
||||
}
|
||||
}
|
||||
|
||||
/^[^>]/ && on_good_record {
|
||||
print $0
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
#!/usr/bin/env -S awk -f
|
||||
# Convert mcl output into tsv output
|
||||
# with two columns 'gene id, family identifier'
|
||||
|
||||
BEGIN {
|
||||
family_identifier=0
|
||||
OFS="\t"
|
||||
}
|
||||
|
||||
{
|
||||
family_identifier++
|
||||
split($0, gene_list, " ")
|
||||
for (gene in gene_list) {
|
||||
print gene, family_identifier
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
#!/usr/bin/env -S awk -f
|
||||
# Associate a isoform id to the length of the sequence
|
||||
|
||||
BEGIN {
|
||||
sequence_length=0
|
||||
isoform_id=""
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
if (isoform_id != "") {
|
||||
print isoform_id, sequence_length
|
||||
} else {
|
||||
isoform_id = $1
|
||||
gsub(">", "", isoform_id)
|
||||
sequence_length = 0
|
||||
}
|
||||
}
|
||||
|
||||
/^[^>]/ {
|
||||
sequence_length += length($0)
|
||||
}
|
||||
|
||||
END {
|
||||
if (isoform_id != "") {
|
||||
print isoform_id, sequence_length
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
#!/usr/bin/env -S awk -f
|
||||
# Filter out 'supercontigs' records
|
||||
# from an Ensembl proteome fasta file
|
||||
# Usage:
|
||||
# awk -f remove_supercontigs.awk \
|
||||
# "proteome.fasta" \
|
||||
# > "proteome_nosupercontigs.fasta"
|
||||
|
||||
BEGIN {
|
||||
on_good_sequence = 0
|
||||
}
|
||||
|
||||
/^>/ {
|
||||
on_good_sequence = 0
|
||||
locus = $3
|
||||
split(locus, locus_array, ":")
|
||||
if (locus_array[1] != "supercontig") {
|
||||
on_good_sequence = 1
|
||||
print $0
|
||||
}
|
||||
}
|
||||
|
||||
/^[^>]/ && on_good_sequence {
|
||||
print $0
|
||||
}
|
Loading…
Reference in New Issue