From 53f6a60943e589b3e60374f2c6ce5293c7dbd3e4 Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Mon, 4 Nov 2024 11:37:20 +0100 Subject: [PATCH] Update workflow --- workflow/main.nf | 18 +++++++---- workflow/modules/blast.nf | 15 +++++++++- workflow/modules/clustering.nf | 2 +- workflow/modules/filter_blastp.nf | 16 ++++++---- .../scripts/filter_blastp_sequence_id.awk | 21 +++++++++++++ workflow/scripts/keep_heaviest_edge_abc.awk | 30 +++++++++++++++++++ workflow/scripts/mcl_to_tsv.awk | 6 ++-- workflow/scripts/protein_lengths.awk | 20 ++++++++----- 8 files changed, 105 insertions(+), 23 deletions(-) create mode 100644 workflow/scripts/filter_blastp_sequence_id.awk create mode 100644 workflow/scripts/keep_heaviest_edge_abc.awk diff --git a/workflow/main.nf b/workflow/main.nf index 8e5af86..d556efc 100644 --- a/workflow/main.nf +++ b/workflow/main.nf @@ -8,18 +8,26 @@ nextflow.enable.dsl = 2; include { GUNZIP } from "./modules/gunzip.nf" -include { BLAST_MAKEBLASTDB } from "./modules/blast.nf" -include { BLAST_BLASTP } from "./modules/blast.nf" +include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf" include { FILTER_FASTA } from "./modules/filter_fasta.nf" include { FILTER_BLASTP } from "./modules/filter_blastp.nf" include { CLUSTERING } from "./modules/clustering.nf" +process PROTEIN_GENE_MAPPING { + + input: + path proteome + + output: + path 'protein_gene.tsv' +} + workflow { proteome = Channel.fromPath(params.proteome) GUNZIP(proteome) FILTER_FASTA(GUNZIP.out) - BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome) - BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out) - FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length) + BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome) + FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths) + CLUSTERING(FILTER_BLASTP.out) } diff --git a/workflow/modules/blast.nf b/workflow/modules/blast.nf index aa095b8..2bbe8a7 100644 --- a/workflow/modules/blast.nf +++ b/workflow/modules/blast.nf @@ -34,6 +34,19 @@ process BLAST_BLASTP { script: """ - blastp -query "${proteome}" -db 'db' -outfmt '6' -out "${species}.all-against-all.blastp.tsv" + blastp -query "${proteome}" -db 'db' -outfmt '6' -num_threads 7 -out "${species}.all-against-all.blastp.tsv" """ } + +workflow BLAST_ALL_AGAINST_ALL { + + take: + proteome + + main: + BLAST_MAKEBLASTDB(params.species, proteome) + BLAST_BLASTP(params.species, proteome, BLAST_MAKEBLASTDB.out) + + emit: + BLAST_BLASTP.out +} diff --git a/workflow/modules/clustering.nf b/workflow/modules/clustering.nf index 431760e..213be83 100644 --- a/workflow/modules/clustering.nf +++ b/workflow/modules/clustering.nf @@ -46,7 +46,7 @@ workflow CLUSTERING { main: BLASTP_TO_ABC(blastp_tsv) - MCL(BLASTP_TO_ABC).out + MCL(BLASTP_TO_ABC.out) MCL_TO_TSV(MCL.out) emit: diff --git a/workflow/modules/filter_blastp.nf b/workflow/modules/filter_blastp.nf index 125f913..4a54336 100644 --- a/workflow/modules/filter_blastp.nf +++ b/workflow/modules/filter_blastp.nf @@ -1,6 +1,7 @@ /** Filter blastp output based on coverage and identity percentage /**/ + process FILTER_BLASTP { input: @@ -8,18 +9,21 @@ process FILTER_BLASTP { val min_identity path blastp path protein_length + path protein_gene output: path 'filtered_blastp.tsv' script: """ - sort "${blastp}" > blastp_s - sort "${protein_length}" > protein_length_s - join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv - sort -k 2 "join1.tsv" > join1.tsv_s - join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv' - awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \ + sort -k 1 "${blastp}" > blastp_s + sort -k 1 "${protein_length}" > protein_length_s + join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv + sort -k 2 join1.tsv > join1.tsv_s + join -1 2 -2 1 -t'\t' join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv' + awk -f "${baseDir}/scripts/filter_blastp.awk" \ + -v coverage="${min_coverage}" \ + -v identity="${min_identity}" \ "${blastp}" > 'filtered_blastp.tsv' """ } diff --git a/workflow/scripts/filter_blastp_sequence_id.awk b/workflow/scripts/filter_blastp_sequence_id.awk new file mode 100644 index 0000000..7b9fc28 --- /dev/null +++ b/workflow/scripts/filter_blastp_sequence_id.awk @@ -0,0 +1,21 @@ +#!/usr/bin/env -S awk -f +# Filter BLASTp format 6 file +# to keep only the records +# with ID in records.list +# +# Usage: +# awk -f filter_blastp_sequence_id.awk \ +# records.list records.blastp.tsv + +NR == FNR { + sequence_id = $1 + to_remove[sequence_id] = 1 + next +} + +{ + sequence_id = $1 + if (!(sequence_id in to_remove)) { + print $0 + } +} diff --git a/workflow/scripts/keep_heaviest_edge_abc.awk b/workflow/scripts/keep_heaviest_edge_abc.awk new file mode 100644 index 0000000..1b6742f --- /dev/null +++ b/workflow/scripts/keep_heaviest_edge_abc.awk @@ -0,0 +1,30 @@ +#!/usr/bin/env -S awk -f +# Usage: $0 file.abc > filtered_file.abc +BEGIN { + OFS = FS = "\t" +} +{ + a=$1 + b=$2 + c=$3 + if (a > b) { + tmp=a + a=b + b=tmp + } + if (!(b in graph[a])) { + graph[a][b] = c + } else { + if (graph[a][b] < c) { + graph[a][b] = c + } + } +} + +END { + for (a in graph) { + for (b in graph[a]) { + print a, b, graph[a][b] + } + } +} diff --git a/workflow/scripts/mcl_to_tsv.awk b/workflow/scripts/mcl_to_tsv.awk index 861843a..d981c34 100644 --- a/workflow/scripts/mcl_to_tsv.awk +++ b/workflow/scripts/mcl_to_tsv.awk @@ -5,12 +5,12 @@ BEGIN { family_identifier=0 OFS="\t" + FS=" " } { family_identifier++ - split($0, gene_list, " ") - for (gene in gene_list) { - print gene, family_identifier + for (i=1; i <= NF; i++) { + print $i, family_identifier } } diff --git a/workflow/scripts/protein_lengths.awk b/workflow/scripts/protein_lengths.awk index 6db9d5a..31b0012 100644 --- a/workflow/scripts/protein_lengths.awk +++ b/workflow/scripts/protein_lengths.awk @@ -1,19 +1,25 @@ #!/usr/bin/env -S awk -f -# Associate a isoform id to the length of the sequence +# Associate a isoform id +# to the length of the sequence +# and the associated gene id BEGIN { sequence_length=0 isoform_id="" + gene_id="" + + OFS="\t" } /^>/ { if (isoform_id != "") { - print isoform_id, sequence_length - } else { - isoform_id = $1 - gsub(">", "", isoform_id) - sequence_length = 0 + print isoform_id, sequence_length, gene_id } + isoform_id = $1 + gene_id = $4 + gsub(">", "", isoform_id) + gsub("gene:", "", gene_id) + sequence_length = 0 } /^[^>]/ { @@ -22,6 +28,6 @@ BEGIN { END { if (isoform_id != "") { - print isoform_id, sequence_length + print isoform_id, sequence_length, gene_id } }