Update workflow

2024-11-04 11:37:20 +01:00 · 2024-11-04 11:37:20 +01:00 · 53f6a60943
parent 0540e3699e
commit 53f6a60943
8 changed files with 105 additions and 23 deletions
--- a/workflow/main.nf
+++ b/workflow/main.nf
@ -8,18 +8,26 @@
 nextflow.enable.dsl = 2;

 include { GUNZIP } from "./modules/gunzip.nf"
-include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
-include { BLAST_BLASTP } from "./modules/blast.nf"
+include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
 include { FILTER_FASTA } from "./modules/filter_fasta.nf"
 include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
 include { CLUSTERING } from "./modules/clustering.nf"

+process PROTEIN_GENE_MAPPING {
+
+    input:
+    path proteome
+
+    output:
+    path 'protein_gene.tsv'
+}
+
 workflow {
    proteome = Channel.fromPath(params.proteome)
    GUNZIP(proteome)
    FILTER_FASTA(GUNZIP.out)
-    BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
-    BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
-    FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
+    BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
+    FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
+
    CLUSTERING(FILTER_BLASTP.out)
 }
--- a/workflow/modules/blast.nf
+++ b/workflow/modules/blast.nf
@ -34,6 +34,19 @@ process BLAST_BLASTP {

    script:
        """
-    blastp -query "${proteome}" -db 'db' -outfmt '6'  -out "${species}.all-against-all.blastp.tsv"
+    blastp -query "${proteome}" -db 'db' -outfmt '6' -num_threads 7  -out "${species}.all-against-all.blastp.tsv"
        """
 }
+
+workflow BLAST_ALL_AGAINST_ALL {
+
+    take:
+    proteome
+
+    main:
+    BLAST_MAKEBLASTDB(params.species, proteome)
+    BLAST_BLASTP(params.species, proteome, BLAST_MAKEBLASTDB.out)
+
+    emit:
+    BLAST_BLASTP.out
+}
--- a/workflow/modules/clustering.nf
+++ b/workflow/modules/clustering.nf
@ -46,7 +46,7 @@ workflow CLUSTERING {

    main:
    BLASTP_TO_ABC(blastp_tsv)
-    MCL(BLASTP_TO_ABC).out
+    MCL(BLASTP_TO_ABC.out)
    MCL_TO_TSV(MCL.out)

    emit:
--- a/workflow/modules/filter_blastp.nf
+++ b/workflow/modules/filter_blastp.nf
@ -1,6 +1,7 @@
 /** Filter blastp output based on coverage and identity percentage
 /**/

+
 process FILTER_BLASTP {

    input:
@ -8,18 +9,21 @@ process FILTER_BLASTP {
    val min_identity
    path blastp
    path protein_length
+    path protein_gene

    output:
    path 'filtered_blastp.tsv'

    script:
    """
-    sort "${blastp}" > blastp_s
-    sort "${protein_length}" > protein_length_s
-    join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
-    sort -k 2 "join1.tsv" > join1.tsv_s
-    join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
-    awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
+    sort -k 1 "${blastp}" > blastp_s
+    sort -k 1 "${protein_length}" > protein_length_s
+    join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv
+    sort -k 2 join1.tsv > join1.tsv_s
+    join -1 2 -2 1 -t'\t' join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv'
+    awk -f "${baseDir}/scripts/filter_blastp.awk" \
+        -v coverage="${min_coverage}" \
+        -v identity="${min_identity}" \
        "${blastp}" > 'filtered_blastp.tsv'
    """
 }
--- a/workflow/scripts/filter_blastp_sequence_id.awk
+++ b/workflow/scripts/filter_blastp_sequence_id.awk
@ -0,0 +1,21 @@
+#!/usr/bin/env -S awk -f
+# Filter BLASTp format 6 file
+# to keep only the records
+# with ID in records.list
+#
+# Usage:
+# awk -f filter_blastp_sequence_id.awk \
+# records.list records.blastp.tsv
+
+NR == FNR {
+    sequence_id = $1
+    to_remove[sequence_id] = 1
+    next
+}
+
+{
+    sequence_id = $1
+    if (!(sequence_id in to_remove)) {
+        print $0
+    }
+}
--- a/workflow/scripts/keep_heaviest_edge_abc.awk
+++ b/workflow/scripts/keep_heaviest_edge_abc.awk
@ -0,0 +1,30 @@
+#!/usr/bin/env -S awk -f
+# Usage: $0 file.abc > filtered_file.abc
+BEGIN {
+    OFS = FS = "\t"
+}
+{
+    a=$1
+    b=$2
+    c=$3
+    if (a > b) {
+        tmp=a
+        a=b
+        b=tmp
+    }
+    if (!(b in graph[a])) {
+        graph[a][b] = c
+    } else {
+        if (graph[a][b] < c) {
+            graph[a][b] = c
+        }
+    }
+}
+
+END {
+    for (a in graph) {
+        for (b in graph[a]) {
+            print a, b, graph[a][b]
+        }
+    }
+}
--- a/workflow/scripts/mcl_to_tsv.awk
+++ b/workflow/scripts/mcl_to_tsv.awk
@ -5,12 +5,12 @@
 BEGIN {
    family_identifier=0
    OFS="\t"
+    FS=" "
 }

 {
    family_identifier++
-    split($0, gene_list, " ")
-    for (gene in gene_list) {
-        print gene, family_identifier
+    for (i=1; i <= NF; i++) {
+        print $i, family_identifier
    }
 }
--- a/workflow/scripts/protein_lengths.awk
+++ b/workflow/scripts/protein_lengths.awk
@ -1,19 +1,25 @@
 #!/usr/bin/env -S awk -f
-# Associate a isoform id to the length of the sequence
+# Associate a isoform id
+# to the length of the sequence
+# and the associated gene id

 BEGIN {
    sequence_length=0
    isoform_id=""
+    gene_id=""
+
+    OFS="\t"
 }

 /^>/ {
    if (isoform_id != "") {
-        print isoform_id, sequence_length
-    } else {
-        isoform_id = $1
-        gsub(">", "", isoform_id)
-        sequence_length = 0
+        print isoform_id, sequence_length, gene_id
    }
+    isoform_id = $1
+    gene_id = $4
+    gsub(">", "", isoform_id)
+    gsub("gene:", "", gene_id)
+    sequence_length = 0
 }

 /^[^>]/ {
@ -22,6 +28,6 @@ BEGIN {

 END {
    if (isoform_id != "") {
-        print isoform_id, sequence_length
+        print isoform_id, sequence_length, gene_id
    }
 }