commit 0540e3699eb87b159c0a1072da538f5101a861cd
Author: Samuel Ortion <samuel@ortion.fr>
Date:   Mon Oct 28 10:46:05 2024 +0100

    Initial Nextflow workflow

diff --git a/workflow/conda/blast.yml b/workflow/conda/blast.yml
new file mode 100644
index 0000000..2304cfd
--- /dev/null
+++ b/workflow/conda/blast.yml
@@ -0,0 +1,5 @@
+name: blast
+channels:
+  - bioconda
+dependencies:
+  - blast=2.5
diff --git a/workflow/conda/mcl.yml b/workflow/conda/mcl.yml
new file mode 100644
index 0000000..203d92e
--- /dev/null
+++ b/workflow/conda/mcl.yml
@@ -0,0 +1,5 @@
+name: mcl
+channels:
+  - bioconda
+dependencies:
+  - bioconda::mcl=22.282
diff --git a/workflow/main.nf b/workflow/main.nf
new file mode 100644
index 0000000..8e5af86
--- /dev/null
+++ b/workflow/main.nf
@@ -0,0 +1,25 @@
+/**
+/** Comparative Genomics workflow
+/**
+/** This workflow find the duplicate genes from a proteome
+/** Then, It finds the Tandemly Arrayed Genes (TAGs)
+/**/
+
+nextflow.enable.dsl = 2;
+
+include { GUNZIP } from "./modules/gunzip.nf"
+include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
+include { BLAST_BLASTP } from "./modules/blast.nf"
+include { FILTER_FASTA } from "./modules/filter_fasta.nf"
+include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
+include { CLUSTERING } from "./modules/clustering.nf"
+
+workflow {
+    proteome = Channel.fromPath(params.proteome)
+    GUNZIP(proteome)
+    FILTER_FASTA(GUNZIP.out)
+    BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
+    BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
+    FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
+    CLUSTERING(FILTER_BLASTP.out)
+}
diff --git a/workflow/modules/blast.nf b/workflow/modules/blast.nf
new file mode 100644
index 0000000..aa095b8
--- /dev/null
+++ b/workflow/modules/blast.nf
@@ -0,0 +1,39 @@
+/** blastp all against all */
+
+process BLAST_MAKEBLASTDB {
+
+    label 'blast'
+
+    input:
+        val species
+        path proteome
+
+    output:
+        path 'db.phr'
+        path 'db.pin'
+        path 'db.psq'
+
+    script:
+        """
+    makeblastdb -in "${proteome}" -dbtype prot -out 'db' -title "${species}"
+        """
+}
+
+process BLAST_BLASTP {
+
+    label 'blast'
+
+    input:
+        val species
+        path proteome
+        path 'db.phr'
+        path 'db.pin'
+        path 'db.psq'
+    output:
+        path "${species}.all-against-all.blastp.tsv"
+
+    script:
+        """
+    blastp -query "${proteome}" -db 'db' -outfmt '6'  -out "${species}.all-against-all.blastp.tsv"
+        """
+}
diff --git a/workflow/modules/clustering.nf b/workflow/modules/clustering.nf
new file mode 100644
index 0000000..431760e
--- /dev/null
+++ b/workflow/modules/clustering.nf
@@ -0,0 +1,54 @@
+process BLASTP_TO_ABC {
+
+    input:
+    path blastp
+
+    output:
+    path 'graph.abc'
+
+    script:
+    """
+    awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
+    """
+}
+
+process MCL {
+
+    input:
+    path abc
+
+    output:
+    path 'clustering.mcl'
+
+    script:
+    """
+    mcl "${abc}" --abc -o 'custering.mcl'
+    """
+}
+
+process MCL_TO_TSV {
+
+    input:
+    path mcl
+
+    output:
+    path 'families.tsv'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/mcl_to_tsv.awk" "${mcl}" > 'families.tsv'
+    """
+}
+
+workflow CLUSTERING {
+    take:
+    blastp_tsv
+
+    main:
+    BLASTP_TO_ABC(blastp_tsv)
+    MCL(BLASTP_TO_ABC).out
+    MCL_TO_TSV(MCL.out)
+
+    emit:
+    families = MCL_TO_TSV.out
+}
diff --git a/workflow/modules/filter_blastp.awk b/workflow/modules/filter_blastp.awk
new file mode 100644
index 0000000..e69de29
diff --git a/workflow/modules/filter_blastp.nf b/workflow/modules/filter_blastp.nf
new file mode 100644
index 0000000..125f913
--- /dev/null
+++ b/workflow/modules/filter_blastp.nf
@@ -0,0 +1,25 @@
+/** Filter blastp output based on coverage and identity percentage
+/**/
+
+process FILTER_BLASTP {
+
+    input:
+    val min_coverage
+    val min_identity
+    path blastp
+    path protein_length
+
+    output:
+    path 'filtered_blastp.tsv'
+
+    script:
+    """
+    sort "${blastp}" > blastp_s
+    sort "${protein_length}" > protein_length_s
+    join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
+    sort -k 2 "join1.tsv" > join1.tsv_s
+    join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
+    awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
+        "${blastp}" > 'filtered_blastp.tsv'
+    """
+}
diff --git a/workflow/modules/filter_fasta.nf b/workflow/modules/filter_fasta.nf
new file mode 100644
index 0000000..aada3b5
--- /dev/null
+++ b/workflow/modules/filter_fasta.nf
@@ -0,0 +1,57 @@
+/** 1. Filter out proteome sequence belonging to 'supercontig' */
+/** 2. Filter proteome to keep only the longest isoform */
+
+process FILTER_SUPERCONTIG {
+    input:
+    path proteome
+
+    output:
+    path 'proteome_nosupercontig.fasta'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/remove_supercontigs.awk" "${proteome}" > 'proteome_nosupercontig.fasta'
+    """
+}
+
+process SEQUENCE_LENGTH {
+    input:
+    path proteome
+
+    output:
+    path 'protein_lengths.tsv'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/protein_lengths.awk" "${proteome}" > 'protein_lengths.tsv'
+    """
+}
+
+process PROTEOME_LONGEST {
+    input:
+    path proteome
+    path protein_lengths
+
+    output:
+    path 'proteome_filtered.fa'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/filter_longest.awk" "${protein_lengths}" "${proteome}" > 'records_filtered.list'
+    awk -f "${baseDir}/scripts/filter_records_fasta.awk" 'records_filtered.list' "${proteome}" > 'proteome_filtered.fa'
+    """
+}
+
+workflow FILTER_FASTA {
+    take:
+    proteome
+
+    main:
+    FILTER_SUPERCONTIG(proteome)
+    SEQUENCE_LENGTH(FILTER_SUPERCONTIG.out)
+    PROTEOME_LONGEST(FILTER_SUPERCONTIG.out, SEQUENCE_LENGTH.out)
+
+    emit:
+    lengths = SEQUENCE_LENGTH.out
+    proteome = PROTEOME_LONGEST.out
+}
diff --git a/workflow/modules/gunzip.nf b/workflow/modules/gunzip.nf
new file mode 100644
index 0000000..2c68cf6
--- /dev/null
+++ b/workflow/modules/gunzip.nf
@@ -0,0 +1,12 @@
+process GUNZIP {
+    input:
+    path zipped_file
+
+    output:
+    path zipped_file.baseName
+
+    script:
+    """
+    gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
+    """
+}
diff --git a/workflow/nextflow.config b/workflow/nextflow.config
new file mode 100644
index 0000000..eb67cce
--- /dev/null
+++ b/workflow/nextflow.config
@@ -0,0 +1,21 @@
+params {
+    proteome = "${baseDir}/../../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
+    species = "Glycine_max"
+    results = "results"
+}
+
+profiles {
+
+    conda {
+        conda.enabled = true
+
+        process {
+            withLabel: blast {
+                conda = "$baseDir/conda/blast.yml"
+            }
+            withLabel: mcl {
+                conda = "$baseDir/conda/mcl.yml"
+            }
+        }
+    }
+}
diff --git a/workflow/scripts/filter_blastp.awk b/workflow/scripts/filter_blastp.awk
new file mode 100644
index 0000000..ad03f75
--- /dev/null
+++ b/workflow/scripts/filter_blastp.awk
@@ -0,0 +1,35 @@
+#!/usr/bin/env -S awk -f
+# Usage: $0 -v min_identity=30 \
+#     -v min_coverage=75 \
+#     ./file.blastp.tsv
+# The blastp.tsv file should have
+# the four following supplementary colums
+#     query length,
+#     query gene ID,
+#     subject length,
+#     subject gene ID
+
+BEGIN {
+    OFS="\t"
+}
+
+function get_coverage(sequence_length, sequence_start, sequence_end) {
+    return (sequence_end - sequence_start + 1) / sequence_length * 100
+}
+
+{
+    qgeneid=$14
+    sgeneid=$16
+    qstart=$7
+    qend=$8
+    sstart=$9
+    send=$10
+    qlength=$13
+    slength=$15
+    identity=$3
+    evalue=$11
+    bitscore=$12
+    if (get_coverage(qlength, qstart, qend) > min_coverage && get_coverage(slength, sstart, send) > min_coverage && identity > min_identity) {
+        print $0
+    }
+}
diff --git a/workflow/scripts/filter_longest.awk b/workflow/scripts/filter_longest.awk
new file mode 100644
index 0000000..dc284f3
--- /dev/null
+++ b/workflow/scripts/filter_longest.awk
@@ -0,0 +1,34 @@
+#!/usr/bin/env -S awk -f
+# Extract the FASTA record id with the longest sequence.
+# Takes a two columns file with record id, record length
+# and the FASTA file.
+#
+# Usage: filter_longest.awk protein_lengths.tsv proteins.fasta
+
+NR==FNR {
+    gsub(">", "", $1)
+    protein_length[$1] = $2
+    next
+}
+
+{
+    gene_id = $4
+    gsub("gene:", "", gene_id)
+    isoform_id = $0
+    gsub(">", "", isoform_id)
+    if (gene_id in max_length) {
+        if (protein_length[isoform_id] > max_length[gene_id]) {
+            max_length[gene_id] = protein_length[isoform_id]
+            max_length_isoform[gene_id] = isoform_id
+        }
+    } else {
+        max_length[gene_id] = protein_length[isoform_id]
+        max_length_isoform[gene_id] = isoform_id
+    }
+}
+
+END {
+    for (gene_id in max_length_isoform) {
+        print max_length_isoform[gene_id]
+    }
+}
diff --git a/workflow/scripts/filter_records_fasta.awk b/workflow/scripts/filter_records_fasta.awk
new file mode 100644
index 0000000..a0590c9
--- /dev/null
+++ b/workflow/scripts/filter_records_fasta.awk
@@ -0,0 +1,32 @@
+#!/usr/bin/env -S awk -f
+# Filter FASTA file
+# to keep only the records
+# with ID in records.list
+#
+# Usage:
+# awk -f filter_record_fasta.awk \
+# records.list records.fasta
+
+BEGIN {
+    on_good_record = 0
+}
+
+NR == FNR {
+    sequence_id = $1
+    kept[sequence_id] = 1
+    next
+}
+
+/^>/ {
+    on_good_record = 0
+    sequence_id = $1
+    gsub(">", "", sequence_id)
+    if (sequence_id in kept) {
+        print $0
+        on_good_record = 1
+    }
+}
+
+/^[^>]/ && on_good_record {
+    print $0
+}
diff --git a/workflow/scripts/mcl_to_tsv.awk b/workflow/scripts/mcl_to_tsv.awk
new file mode 100644
index 0000000..861843a
--- /dev/null
+++ b/workflow/scripts/mcl_to_tsv.awk
@@ -0,0 +1,16 @@
+#!/usr/bin/env -S awk -f
+# Convert mcl output into tsv output
+# with two columns 'gene id, family identifier'
+
+BEGIN {
+    family_identifier=0
+    OFS="\t"
+}
+
+{
+    family_identifier++
+    split($0, gene_list, " ")
+    for (gene in gene_list) {
+        print gene, family_identifier
+    }
+}
diff --git a/workflow/scripts/protein_lengths.awk b/workflow/scripts/protein_lengths.awk
new file mode 100644
index 0000000..6db9d5a
--- /dev/null
+++ b/workflow/scripts/protein_lengths.awk
@@ -0,0 +1,27 @@
+#!/usr/bin/env -S awk -f
+# Associate a isoform id to the length of the sequence
+
+BEGIN {
+    sequence_length=0
+    isoform_id=""
+}
+
+/^>/ {
+    if (isoform_id != "") {
+        print isoform_id, sequence_length
+    } else {
+        isoform_id = $1
+        gsub(">", "", isoform_id)
+        sequence_length = 0
+    }
+}
+
+/^[^>]/ {
+    sequence_length += length($0)
+}
+
+END {
+    if (isoform_id != "") {
+        print isoform_id, sequence_length
+    }
+}
diff --git a/workflow/scripts/remove_supercontigs.awk b/workflow/scripts/remove_supercontigs.awk
new file mode 100644
index 0000000..04474c2
--- /dev/null
+++ b/workflow/scripts/remove_supercontigs.awk
@@ -0,0 +1,25 @@
+#!/usr/bin/env -S awk -f
+# Filter out 'supercontigs' records
+# from an Ensembl proteome fasta file
+# Usage:
+# awk -f remove_supercontigs.awk \
+# "proteome.fasta" \
+# > "proteome_nosupercontigs.fasta"
+
+BEGIN {
+    on_good_sequence = 0
+}
+
+/^>/ {
+    on_good_sequence = 0
+    locus = $3
+    split(locus, locus_array, ":")
+    if (locus_array[1] != "supercontig") {
+        on_good_sequence = 1
+        print $0
+    }
+}
+
+/^[^>]/ && on_good_sequence {
+    print $0
+}