From 3e17d75618d191384d401b960599a5a903a962fc Mon Sep 17 00:00:00 2001
From: Samuel Ortion <samuel@ortion.fr>
Date: Sun, 29 Dec 2024 14:11:49 +0100
Subject: [PATCH] Moved families and TAG workflow to subfolder

---
 workflow/.gitignore                           |  2 +
 .../{ => families_and_TAGs}/conda/blast.yml   |  0
 .../{ => families_and_TAGs}/conda/mcl.yml     |  0
 workflow/families_and_TAGs/main.nf            | 27 +++++++
 .../{ => families_and_TAGs}/modules/blast.nf  |  0
 .../modules/clustering.nf                     | 16 ++++-
 .../modules/filter_blastp.awk                 |  0
 .../modules/filter_blastp.nf                  | 72 +++++++++++++++++++
 .../modules/filter_fasta.nf                   |  0
 .../{ => families_and_TAGs}/modules/gunzip.nf |  0
 workflow/families_and_TAGs/modules/tags.nf    | 44 ++++++++++++
 .../scripts/filter_blastp.awk                 |  0
 .../scripts/filter_blastp_sequence_id.awk     |  0
 .../scripts/filter_longest.awk                |  0
 .../scripts/filter_records_fasta.awk          |  0
 .../scripts/gff3_to_gene_positions_table.awk  | 22 ++++++
 .../scripts/keep_heaviest_edge_abc.awk        |  0
 .../scripts/mcl_to_tsv.awk                    |  6 +-
 .../scripts/protein_lengths.awk               |  0
 .../scripts/remove_supercontigs.awk           |  0
 workflow/main.nf                              | 33 ---------
 workflow/modules/filter_blastp.nf             | 29 --------
 workflow/nextflow.config                      | 21 ------
 23 files changed, 186 insertions(+), 86 deletions(-)
 create mode 100644 workflow/.gitignore
 rename workflow/{ => families_and_TAGs}/conda/blast.yml (100%)
 rename workflow/{ => families_and_TAGs}/conda/mcl.yml (100%)
 create mode 100644 workflow/families_and_TAGs/main.nf
 rename workflow/{ => families_and_TAGs}/modules/blast.nf (100%)
 rename workflow/{ => families_and_TAGs}/modules/clustering.nf (68%)
 rename workflow/{ => families_and_TAGs}/modules/filter_blastp.awk (100%)
 create mode 100644 workflow/families_and_TAGs/modules/filter_blastp.nf
 rename workflow/{ => families_and_TAGs}/modules/filter_fasta.nf (100%)
 rename workflow/{ => families_and_TAGs}/modules/gunzip.nf (100%)
 create mode 100644 workflow/families_and_TAGs/modules/tags.nf
 rename workflow/{ => families_and_TAGs}/scripts/filter_blastp.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/filter_blastp_sequence_id.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/filter_longest.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/filter_records_fasta.awk (100%)
 create mode 100644 workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
 rename workflow/{ => families_and_TAGs}/scripts/keep_heaviest_edge_abc.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/mcl_to_tsv.awk (66%)
 rename workflow/{ => families_and_TAGs}/scripts/protein_lengths.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/remove_supercontigs.awk (100%)
 delete mode 100644 workflow/main.nf
 delete mode 100644 workflow/modules/filter_blastp.nf
 delete mode 100644 workflow/nextflow.config

diff --git a/workflow/.gitignore b/workflow/.gitignore
new file mode 100644
index 0000000..de6c3af
--- /dev/null
+++ b/workflow/.gitignore
@@ -0,0 +1,2 @@
+.nextflow.log*
+
diff --git a/workflow/conda/blast.yml b/workflow/families_and_TAGs/conda/blast.yml
similarity index 100%
rename from workflow/conda/blast.yml
rename to workflow/families_and_TAGs/conda/blast.yml
diff --git a/workflow/conda/mcl.yml b/workflow/families_and_TAGs/conda/mcl.yml
similarity index 100%
rename from workflow/conda/mcl.yml
rename to workflow/families_and_TAGs/conda/mcl.yml
diff --git a/workflow/families_and_TAGs/main.nf b/workflow/families_and_TAGs/main.nf
new file mode 100644
index 0000000..9b354d3
--- /dev/null
+++ b/workflow/families_and_TAGs/main.nf
@@ -0,0 +1,27 @@
+/**
+/** Comparative Genomics workflow
+/**
+/** This workflow finds the duplicate genes from a proteome
+/** Then, it finds the Tandemly Arrayed Genes (TAGs)
+/**/
+include { GUNZIP as GUNZIP_1 } from "./modules/gunzip.nf"
+include { GUNZIP as GUNZIP_2 } from "./modules/gunzip.nf"
+// include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
+include { FILTER_FASTA } from "./modules/filter_fasta.nf"
+include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
+include { CLUSTERING } from "./modules/clustering.nf"
+include { TAGs } from "./modules/tags.nf"
+
+workflow {
+    proteome = Channel.fromPath(params.proteome)
+    gff3 = Channel.fromPath(params.gff3)
+    GUNZIP_1(proteome)
+    GUNZIP_2(gff3)
+    FILTER_FASTA(GUNZIP_1.out)
+//    BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
+//    FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
+    // NOTE(review): the FILTER_BLASTP workflow takes (blastp, protein_length, min_coverage, min_identity); reorder the call above before re-enabling it.
+ //   CLUSTERING(FILTER_BLASTP.out)
+    // NOTE(review): the TAGs workflow takes (gff3, families); swap the arguments of the call below before re-enabling it.
+  //  TAGs(CLUSTERING.out, GUNZIP_2.out)
+}
diff --git a/workflow/modules/blast.nf b/workflow/families_and_TAGs/modules/blast.nf
similarity index 100%
rename from workflow/modules/blast.nf
rename to workflow/families_and_TAGs/modules/blast.nf
diff --git a/workflow/modules/clustering.nf b/workflow/families_and_TAGs/modules/clustering.nf
similarity index 68%
rename from workflow/modules/clustering.nf
rename to workflow/families_and_TAGs/modules/clustering.nf
index 213be83..a850ac6 100644
--- a/workflow/modules/clustering.nf
+++ b/workflow/families_and_TAGs/modules/clustering.nf
@@ -8,7 +8,21 @@ process BLASTP_TO_ABC {
 
     script:
     """
-    awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
+    awk 'BEGIN { FS = OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
+    """
+}
+
+process KEEP_HEAVIEST_EDGE {
+    // Runs scripts/keep_heaviest_edge_abc.awk on the staged input file `abc`; awk's stdout is captured as 'heaviest.abc'.
+    input:
+    path abc
+
+    output:
+    path 'heaviest.abc'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/keep_heaviest_edge_abc.awk" "${abc}" > 'heaviest.abc'
     """
 }
 
diff --git a/workflow/modules/filter_blastp.awk b/workflow/families_and_TAGs/modules/filter_blastp.awk
similarity index 100%
rename from workflow/modules/filter_blastp.awk
rename to workflow/families_and_TAGs/modules/filter_blastp.awk
diff --git a/workflow/families_and_TAGs/modules/filter_blastp.nf b/workflow/families_and_TAGs/modules/filter_blastp.nf
new file mode 100644
index 0000000..af27b95
--- /dev/null
+++ b/workflow/families_and_TAGs/modules/filter_blastp.nf
@@ -0,0 +1,72 @@
+/** Filter blastp output based on coverage and identity percentage
+/**/
+
+process REMOVE_LOOPS {
+    // Keeps only the rows of the tab-separated `blastp` table whose first two columns differ.
+    input:
+    path blastp
+
+    output:
+    path 'noloops.blastp.tsv'
+
+    script:
+    // awk must be given the input file and a redirect: without them it reads stdin and the declared output is never created.
+    """
+    awk 'BEGIN { FS=OFS="\t" }  \$1 != \$2 { print \$0 }' "${blastp}" > 'noloops.blastp.tsv'
+    """
+}
+
+process ADD_GENE_ID_AND_PROTEIN_LENGTHS {
+    // Joins the `blastp` table with the `protein_length` table twice: first on blastp column 1, then on column 2 of the intermediate result.
+    input:
+    path blastp
+    path protein_length
+
+    output:
+    path 'joined.blastp.tsv'
+    // join needs its inputs ordered on the join field alone, so each sort uses the same tab delimiter and a single-field key.
+    script:
+    """
+    sort -t \$'\t' -k 1,1 "${blastp}" > blastp_s
+    sort -t \$'\t' -k 1,1 "${protein_length}" > protein_length_s
+    join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv
+    sort -t \$'\t' -k 2,2 join1.tsv > join1.tsv_s
+    join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > joined.blastp.tsv
+    """
+
+}
+
+process FILTER_COVERAGE_IDENTITY_BLASTP {
+    // Runs scripts/filter_blastp.awk on `blastp`, passing the two thresholds as the awk variables `coverage` and `identity`.
+    input:
+    val min_coverage
+    val min_identity
+    path blastp
+
+    output:
+    path 'filtered.blastp.tsv'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/filter_blastp.awk" \
+        -v coverage="${min_coverage}" \
+        -v identity="${min_identity}" \
+        "${blastp}" > 'filtered.blastp.tsv'
+    """
+}
+
+workflow FILTER_BLASTP {
+    take:
+    blastp
+    protein_length
+    min_coverage
+    min_identity
+    // Chain: REMOVE_LOOPS -> ADD_GENE_ID_AND_PROTEIN_LENGTHS -> FILTER_COVERAGE_IDENTITY_BLASTP; arguments follow each process's input order.
+    main:
+    REMOVE_LOOPS(blastp)
+    ADD_GENE_ID_AND_PROTEIN_LENGTHS(REMOVE_LOOPS.out, protein_length)
+    FILTER_COVERAGE_IDENTITY_BLASTP(min_coverage, min_identity, ADD_GENE_ID_AND_PROTEIN_LENGTHS.out)
+    // Emit the output channel of the last process (its `.out`), not the process object itself.
+    emit:
+    blastp = FILTER_COVERAGE_IDENTITY_BLASTP.out
+}
diff --git a/workflow/modules/filter_fasta.nf b/workflow/families_and_TAGs/modules/filter_fasta.nf
similarity index 100%
rename from workflow/modules/filter_fasta.nf
rename to workflow/families_and_TAGs/modules/filter_fasta.nf
diff --git a/workflow/modules/gunzip.nf b/workflow/families_and_TAGs/modules/gunzip.nf
similarity index 100%
rename from workflow/modules/gunzip.nf
rename to workflow/families_and_TAGs/modules/gunzip.nf
diff --git a/workflow/families_and_TAGs/modules/tags.nf b/workflow/families_and_TAGs/modules/tags.nf
new file mode 100644
index 0000000..43ebfb7
--- /dev/null
+++ b/workflow/families_and_TAGs/modules/tags.nf
@@ -0,0 +1,44 @@
+process TAG_FINDER {
+    // Runs the tagfinder binary with the `positions` and `families` files; its stdout is captured as 'tags.tsv'.
+    input:
+    path positions
+    path families
+
+    output:
+    path 'tags.tsv'
+    // baseDir is already an absolute path; prefixing it with "./" would resolve inside the task work directory and fail.
+    script:
+    """
+    "${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" > 'tags.tsv'
+    """
+}
+
+
+process GENE_POSITION_TABLE {
+    // Runs scripts/gff3_to_gene_positions_table.awk on the `gff3` file; awk's stdout is captured as 'gene_positions.tsv'.
+    input:
+    path gff3
+
+    output:
+    path 'gene_positions.tsv'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/gff3_to_gene_positions_table.awk" "${gff3}" > 'gene_positions.tsv'
+    """
+}
+
+
+workflow TAGs {
+    // Builds the gene position table from `gff3`, then feeds it together with `families` to TAG_FINDER and emits TAG_FINDER's output.
+    take:
+    gff3
+    families
+
+    main:
+    GENE_POSITION_TABLE(gff3)
+    TAG_FINDER(GENE_POSITION_TABLE.out, families)
+
+    emit:
+    TAG_FINDER.out
+}
diff --git a/workflow/scripts/filter_blastp.awk b/workflow/families_and_TAGs/scripts/filter_blastp.awk
similarity index 100%
rename from workflow/scripts/filter_blastp.awk
rename to workflow/families_and_TAGs/scripts/filter_blastp.awk
diff --git a/workflow/scripts/filter_blastp_sequence_id.awk b/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk
similarity index 100%
rename from workflow/scripts/filter_blastp_sequence_id.awk
rename to workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk
diff --git a/workflow/scripts/filter_longest.awk b/workflow/families_and_TAGs/scripts/filter_longest.awk
similarity index 100%
rename from workflow/scripts/filter_longest.awk
rename to workflow/families_and_TAGs/scripts/filter_longest.awk
diff --git a/workflow/scripts/filter_records_fasta.awk b/workflow/families_and_TAGs/scripts/filter_records_fasta.awk
similarity index 100%
rename from workflow/scripts/filter_records_fasta.awk
rename to workflow/families_and_TAGs/scripts/filter_records_fasta.awk
diff --git a/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk b/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
new file mode 100644
index 0000000..94c6e9f
--- /dev/null
+++ b/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
@@ -0,0 +1,22 @@
+#!/usr/bin/env -S awk -f
+# Convert a standard GFF3 file into a tab-separated table
+# with the columns: gene ID, chromosome, start, end,
+# to be used in the TAG detection step.
+# Usage: gff3_to_gene_positions_table.awk input.gff
+
+BEGIN {
+    FS = OFS = "\t"  # GFF3 columns are tab-separated; splitting on blanks would shift fields when a column contains a space
+    selected["gene"] = 1
+}
+
+/^[^#]/ && ($3 in selected) {  # non-comment records whose feature type is selected; `in` does not insert new array keys
+    chromosome=$1
+    info=$9
+    split(info, infoarr, ";")
+    geneid=infoarr[1]  # the first attribute is taken as the identifier
+    gsub("ID=", "", geneid)
+    gsub("gene:", "", geneid) # in Ensembl GFF.
+    start=$4
+    end=$5
+    print geneid, chromosome, start, end
+}
diff --git a/workflow/scripts/keep_heaviest_edge_abc.awk b/workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk
similarity index 100%
rename from workflow/scripts/keep_heaviest_edge_abc.awk
rename to workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk
diff --git a/workflow/scripts/mcl_to_tsv.awk b/workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
similarity index 66%
rename from workflow/scripts/mcl_to_tsv.awk
rename to workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
index d981c34..e24afea 100644
--- a/workflow/scripts/mcl_to_tsv.awk
+++ b/workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
@@ -10,7 +10,9 @@ BEGIN {
 
 {
     family_identifier++
-    for (i=1; i <= NF; i++) {
-        print $i, family_identifier
+    if (NF > 1) { # single-field lines are skipped; the identifier was already incremented above, so printed identifiers may have gaps
+        for (i=1; i <= NF; i++) {
+            print $i, family_identifier
+        }
     }
 }
diff --git a/workflow/scripts/protein_lengths.awk b/workflow/families_and_TAGs/scripts/protein_lengths.awk
similarity index 100%
rename from workflow/scripts/protein_lengths.awk
rename to workflow/families_and_TAGs/scripts/protein_lengths.awk
diff --git a/workflow/scripts/remove_supercontigs.awk b/workflow/families_and_TAGs/scripts/remove_supercontigs.awk
similarity index 100%
rename from workflow/scripts/remove_supercontigs.awk
rename to workflow/families_and_TAGs/scripts/remove_supercontigs.awk
diff --git a/workflow/main.nf b/workflow/main.nf
deleted file mode 100644
index d556efc..0000000
--- a/workflow/main.nf
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
-/** Comparative Genomics workflow
-/**
-/** This workflow find the duplicate genes from a proteome
-/** Then, It finds the Tandemly Arrayed Genes (TAGs)
-/**/
-
-nextflow.enable.dsl = 2;
-
-include { GUNZIP } from "./modules/gunzip.nf"
-include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
-include { FILTER_FASTA } from "./modules/filter_fasta.nf"
-include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
-include { CLUSTERING } from "./modules/clustering.nf"
-
-process PROTEIN_GENE_MAPPING {
-
-    input:
-    path proteome
-
-    output:
-    path 'protein_gene.tsv'
-}
-
-workflow {
-    proteome = Channel.fromPath(params.proteome)
-    GUNZIP(proteome)
-    FILTER_FASTA(GUNZIP.out)
-    BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
-    FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
-
-    CLUSTERING(FILTER_BLASTP.out)
-}
diff --git a/workflow/modules/filter_blastp.nf b/workflow/modules/filter_blastp.nf
deleted file mode 100644
index 4a54336..0000000
--- a/workflow/modules/filter_blastp.nf
+++ /dev/null
@@ -1,29 +0,0 @@
-/** Filter blastp output based on coverage and identity percentage
-/**/
-
-
-process FILTER_BLASTP {
-
-    input:
-    val min_coverage
-    val min_identity
-    path blastp
-    path protein_length
-    path protein_gene
-
-    output:
-    path 'filtered_blastp.tsv'
-
-    script:
-    """
-    sort -k 1 "${blastp}" > blastp_s
-    sort -k 1 "${protein_length}" > protein_length_s
-    join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv
-    sort -k 2 join1.tsv > join1.tsv_s
-    join -1 2 -2 1 -t'\t' join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv'
-    awk -f "${baseDir}/scripts/filter_blastp.awk" \
-        -v coverage="${min_coverage}" \
-        -v identity="${min_identity}" \
-        "${blastp}" > 'filtered_blastp.tsv'
-    """
-}
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
deleted file mode 100644
index eb67cce..0000000
--- a/workflow/nextflow.config
+++ /dev/null
@@ -1,21 +0,0 @@
-params {
-    proteome = "${baseDir}/../../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
-    species = "Glycine_max"
-    results = "results"
-}
-
-profiles {
-
-    conda {
-        conda.enabled = true
-
-        process {
-            withLabel: blast {
-                conda = "$baseDir/conda/blast.yml"
-            }
-            withLabel: mcl {
-                conda = "$baseDir/conda/mcl.yml"
-            }
-        }
-    }
-}