Update workflow

2024-11-04 11:37:20 +01:00 · 2024-11-04 11:37:20 +01:00 · 53f6a60943
parent 0540e3699e
commit 53f6a60943
8 changed files with 105 additions and 23 deletions
--- a/workflow/main.nf
+++ b/workflow/main.nf
@ -8,18 +8,26 @@
 nextflow.enable.dsl = 2;
 include { GUNZIP } from "./modules/gunzip.nf"
-include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
+include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
 include { BLAST_BLASTP } from "./modules/blast.nf"
 include { FILTER_FASTA } from "./modules/filter_fasta.nf"
 include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
 include { CLUSTERING } from "./modules/clustering.nf"
 // PROTEIN_GENE_MAPPING: declares the proteome file as input and
 // 'protein_gene.tsv' as output.
 // NOTE(review): no script/exec section is visible for this process in this
 // view, and the entry workflow shown in this diff never calls it. Confirm
 // whether it is still a stub.
 process PROTEIN_GENE_MAPPING {
    input:
    path proteome
    output:
    path 'protein_gene.tsv'
 }
 // Entry workflow. Call chain as written:
 // GUNZIP -> FILTER_FASTA -> BLAST_ALL_AGAINST_ALL -> FILTER_BLASTP -> CLUSTERING,
 // starting from the file(s) matched by params.proteome.
 workflow {
    proteome = Channel.fromPath(params.proteome)
    GUNZIP(proteome)
    FILTER_FASTA(GUNZIP.out)
-    BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
+    BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
-    BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
    // NOTE(review): four arguments are passed here, while FILTER_BLASTP as it
    // appears in this diff also lists a `path protein_gene` input after
    // `path protein_length`. Confirm the call matches the process inputs.
+    FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
-    FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
+
    CLUSTERING(FILTER_BLASTP.out)
 }
--- a/workflow/modules/blast.nf
+++ b/workflow/modules/blast.nf
@ -34,6 +34,19 @@ process BLAST_BLASTP {
    script:
        """
-    blastp -query "${proteome}" -db 'db' -outfmt '6'  -out "${species}.all-against-all.blastp.tsv"
+    blastp -query "${proteome}" -db 'db' -outfmt '6' -num_threads 7  -out "${species}.all-against-all.blastp.tsv"
        """
 }
 // Sub-workflow chaining the two BLAST steps for one proteome:
 // BLAST_MAKEBLASTDB is run on the proteome, and its output is handed to
 // BLAST_BLASTP together with the same proteome, so query and database come
 // from the same input.
 // Both steps also receive params.species.
 // Emits BLAST_BLASTP.out.
 workflow BLAST_ALL_AGAINST_ALL {
    take:
    proteome
    main:
    BLAST_MAKEBLASTDB(params.species, proteome)
    BLAST_BLASTP(params.species, proteome, BLAST_MAKEBLASTDB.out)
    emit:
    BLAST_BLASTP.out
 }
--- a/workflow/modules/clustering.nf
+++ b/workflow/modules/clustering.nf
@ -46,7 +46,7 @@ workflow CLUSTERING {
    main:
    BLASTP_TO_ABC(blastp_tsv)
-    MCL(BLASTP_TO_ABC).out
+    MCL(BLASTP_TO_ABC.out)
    MCL_TO_TSV(MCL.out)
    emit:
--- a/workflow/modules/filter_blastp.nf
+++ b/workflow/modules/filter_blastp.nf
@ -1,6 +1,7 @@
 /** Filter blastp output based on coverage and identity percentage
 /**/
 process FILTER_BLASTP {
    input:
@ -8,18 +9,21 @@ process FILTER_BLASTP {
    val min_identity
    path blastp
    path protein_length
    path protein_gene
    output:
    path 'filtered_blastp.tsv'
    script:
    // Steps:
    //   1. sort the BLASTp table and the protein-length table on column 1;
    //   2. join them on column 1, then re-sort on column 2 and join again on
    //      column 2, producing 'joined.blastp.tsv';
    //   3. run filter_blastp.awk with the coverage / identity thresholds.
    // Every sort uses the same tab delimiter and a single-column key
    // (-t, -k N,N) as the join that consumes it: join requires its inputs to
    // be ordered on the join field exactly as it compares them.
    // NOTE(review): 'joined.blastp.tsv' is produced but awk is still run on
    // the raw BLASTp file. Confirm which input filter_blastp.awk expects.
    """
-    sort "${blastp}" > blastp_s
+    sort -t'\t' -k 1,1 "${blastp}" > blastp_s
-    sort "${protein_length}" > protein_length_s
+    sort -t'\t' -k 1,1 "${protein_length}" > protein_length_s
-    join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
+    join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv
-    sort -k 2 "join1.tsv" > join1.tsv_s
+    sort -t'\t' -k 2,2 join1.tsv > join1.tsv_s
-    join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
+    join -1 2 -2 1 -t'\t' 'join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv'
-    awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
+    awk -f "${baseDir}/scripts/filter_blastp.awk" \
        -v coverage="${min_coverage}" \
        -v identity="${min_identity}" \
        "${blastp}" > 'filtered_blastp.tsv'
    """
 }
--- a/workflow/scripts/filter_blastp_sequence_id.awk
+++ b/workflow/scripts/filter_blastp_sequence_id.awk
@ -0,0 +1,21 @@
 #!/usr/bin/env -S awk -f
 # Filter BLASTp format 6 file:
 # drop every record whose first column
 # is listed in records.list and print all the others.
 #
 # NOTE(review): this header used to say that the listed IDs are the
 # ones to keep, but the code stores them in to_remove and prints only
 # the records that are NOT listed. Confirm which behaviour is intended.
 #
 # Usage:
 # awk -f filter_blastp_sequence_id.awk \
 # records.list records.blastp.tsv

 # First file (records.list): remember each ID.
 # FILENAME is checked against ARGV[1] because NR == FNR alone is also
 # true for the BLASTp file when records.list is empty, which would
 # swallow every BLASTp record instead of printing it.
 NR == FNR && FILENAME == ARGV[1] {
    to_remove[$1] = 1
    next
 }
 # Remaining records (the BLASTp rows): print unless the ID was listed.
 {
    if (!($1 in to_remove)) {
        print $0
    }
 }
--- a/workflow/scripts/keep_heaviest_edge_abc.awk
+++ b/workflow/scripts/keep_heaviest_edge_abc.awk
@ -0,0 +1,30 @@
 #!/usr/bin/env -S awk -f
 # Usage: $0 file.abc > filtered_file.abc
 #
 # Reads tab-separated "a b c" rows, treats (a, b) and (b, a) as the
 # same pair, and prints one row per pair carrying the largest c seen.
 # Relies on arrays of arrays (graph[x][y]), which is a gawk extension.
 BEGIN {
    FS = "\t"
    OFS = FS
 }
 {
    lo = $1
    hi = $2
    weight = $3
    # Orient the pair so both directions land on the same key.
    if (lo > hi) {
        lo = $2
        hi = $1
    }
    # Store on first sighting, or when this edge outweighs the stored one.
    if (!(hi in graph[lo]) || graph[lo][hi] < weight) {
        graph[lo][hi] = weight
    }
 }
 END {
    for (lo in graph) {
        for (hi in graph[lo]) {
            print lo, hi, graph[lo][hi]
        }
    }
 }
--- a/workflow/scripts/mcl_to_tsv.awk
+++ b/workflow/scripts/mcl_to_tsv.awk
@ -5,12 +5,12 @@
 BEGIN {
    family_identifier=0
    OFS="\t"
    FS=" "
 }
 # Each input line is one family: families are numbered from 1 in input
 # order, and every field of the line is printed on its own row as
 # "<field>\t<family_identifier>".
 # The old `print gene, family_identifier` is removed together with the
 # `for (gene in gene_list)` loop it belonged to; `gene` is never set any
 # more, so it only emitted an extra row with an empty first column.
 {
    family_identifier++
-    split($0, gene_list, " ")
+    for (i=1; i <= NF; i++) {
-    for (gene in gene_list) {
+        print $i, family_identifier
-        print gene, family_identifier
    }
 }
--- a/workflow/scripts/protein_lengths.awk
+++ b/workflow/scripts/protein_lengths.awk
@ -1,19 +1,25 @@
 #!/usr/bin/env -S awk -f
-# Associate a isoform id to the length of the sequence
+# Associate an isoform id
 # to the length of the sequence
 # and the associated gene id
 # Initialise the per-record state and make output tab-separated.
 BEGIN {
    sequence_length=0
    isoform_id=""
    gene_id=""
    OFS="\t"
 }
 # Header line (starts with ">"): print the record collected so far, if any,
 # then start a new record from this header.
 /^>/ {
    if (isoform_id != "") {
-        print isoform_id, sequence_length
+        print isoform_id, sequence_length, gene_id
    } else {
        # NOTE(review): these three statements are repeated unconditionally
        # right after the if/else, so this branch looks redundant.
        isoform_id = $1
        gsub(">", "", isoform_id)
        sequence_length = 0
    }
    # Isoform id: first field with ">" removed.
    # Gene id: fourth field with "gene:" removed.
    # NOTE(review): assumes the header's 4th whitespace-separated field is
    # "gene:<id>". Confirm against the FASTA headers actually used.
    isoform_id = $1
    gene_id = $4
    gsub(">", "", isoform_id)
    gsub("gene:", "", gene_id)
    sequence_length = 0
 }
 /^[^>]/ {
@ -22,6 +28,6 @@ BEGIN {
 # Flush the last record: records are otherwise printed only when the next
 # header line is read, so the final one has to be written here.
 # Nothing is printed when no header line was ever seen.
 END {
    if (isoform_id != "") {
-        print isoform_id, sequence_length
+        print isoform_id, sequence_length, gene_id
    }
 }