Moved families and TAG workflow to subfolder

2024-12-29 14:11:49 +01:00 · 2024-12-29 14:11:49 +01:00 · 3e17d75618
parent 76852dfaf8
commit 3e17d75618
23 changed files with 186 additions and 86 deletions
--- a/workflow/.gitignore
+++ b/workflow/.gitignore
@ -0,0 +1,2 @@
 .nextflow.log*
--- a/workflow/families_and_TAGs/conda/blast.yml
+++ b/workflow/families_and_TAGs/conda/blast.yml
--- a/workflow/families_and_TAGs/conda/mcl.yml
+++ b/workflow/families_and_TAGs/conda/mcl.yml
--- a/workflow/families_and_TAGs/main.nf
+++ b/workflow/families_and_TAGs/main.nf
@ -0,0 +1,27 @@
 /**
 /** Comparative Genomics workflow
 /**
 /** This workflow finds the duplicate genes in a proteome,
 /** then it finds the Tandemly Arrayed Genes (TAGs).
 /**/
 include { GUNZIP as GUNZIP_1 } from "./modules/gunzip.nf"
 include { GUNZIP as GUNZIP_2 } from "./modules/gunzip.nf"
 // include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
 include { FILTER_FASTA } from "./modules/filter_fasta.nf"
 include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
 include { CLUSTERING } from "./modules/clustering.nf"
 include { TAGs } from "./modules/tags.nf"
 // Entry workflow: decompresses the proteome and the GFF3 annotation given by
 // params.proteome and params.gff3, then runs FILTER_FASTA on the proteome.
 workflow {
    proteome = Channel.fromPath(params.proteome)
    gff3 = Channel.fromPath(params.gff3)
    GUNZIP_1(proteome)
    GUNZIP_2(gff3)
    FILTER_FASTA(GUNZIP_1.out)
 // Downstream steps are currently disabled. The argument order below follows
 // the `take:` lists of the sub-workflows: FILTER_BLASTP takes
 // (blastp, protein_length, min_coverage, min_identity) and TAGs takes
 // (gff3, families).
 //    BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
 //    FILTER_BLASTP(BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths, params.min_coverage, params.min_identity)
 //    CLUSTERING(FILTER_BLASTP.out)
 //    TAGs(GUNZIP_2.out, CLUSTERING.out)
 }
--- a/workflow/families_and_TAGs/modules/blast.nf
+++ b/workflow/families_and_TAGs/modules/blast.nf
--- a/workflow/families_and_TAGs/modules/clustering.nf
+++ b/workflow/families_and_TAGs/modules/clustering.nf
@ -8,7 +8,21 @@ process BLASTP_TO_ABC {
    script:
    """
-    awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
+    awk 'BEGIN { FS = OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
    """
 }
 // Runs scripts/keep_heaviest_edge_abc.awk over the given ABC edge list and
 // stores the script's standard output as 'heaviest.abc'.
 // NOTE(review): the awk script is not shown here; judging by its name it keeps
 // only the heaviest edge per node pair - confirm against the script.
 process KEEP_HEAVIEST_EDGE {
    input:
    // Edge list file passed to the awk script as its only input file.
    path abc
    output:
    path 'heaviest.abc'
    script:
    """
    awk -f "${baseDir}/scripts/keep_heaviest_edge_abc.awk" "${abc}" > 'heaviest.abc'
    """
 }
--- a/workflow/families_and_TAGs/modules/filter_blastp.awk
+++ b/workflow/families_and_TAGs/modules/filter_blastp.awk
--- a/workflow/families_and_TAGs/modules/filter_blastp.nf
+++ b/workflow/families_and_TAGs/modules/filter_blastp.nf
@ -0,0 +1,72 @@
 /** Filter blastp output based on coverage and identity percentage
 /**/
 // Removes self-hits ("loops") from a tab-separated blastp table: only rows
 // whose first and second columns differ are kept.
 //
 // input : blastp                - tab-separated blastp table
 // output: 'noloops.blastp.tsv'  - the same table without rows where $1 == $2
 process REMOVE_LOOPS {
    input:
    path blastp
    output:
    path 'noloops.blastp.tsv'
    script:
    // awk must be given the staged input file and its stdout must be redirected
    // to the declared output; otherwise it reads stdin and the output file is
    // never created, which makes the task fail.
    """
    awk 'BEGIN { FS=OFS="\t" }  \$1 != \$2 { print \$0 }' "${blastp}" > 'noloops.blastp.tsv'
    """
 }
 // Appends protein-length columns to a tab-separated blastp table by joining it
 // twice against the protein length table: first on blastp column 1, then on
 // column 2 of the intermediate result.
 //
 // input : blastp          - tab-separated blastp table
 //         protein_length  - tab-separated table keyed on its first column
 // output: 'joined.blastp.tsv'
 //
 // Note: join(1) prints the join field first, so after the second join the
 // original column 2 becomes the first column of 'joined.blastp.tsv'.
 process ADD_GENE_ID_AND_PROTEIN_LENGTHS {
    input:
    path blastp
    path protein_length
    output:
    path 'joined.blastp.tsv'
    script:
    // join(1) requires both inputs to be sorted on exactly the join field, using
    // the same field separator. Restrict each sort key to that single
    // tab-delimited field (-t TAB -k N,N) instead of "field N to end of line"
    // with whitespace splitting.
    """
    sort -t \$'\t' -k 1,1 "${blastp}" > blastp_s
    sort -t \$'\t' -k 1,1 "${protein_length}" > protein_length_s
    join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv
    sort -t \$'\t' -k 2,2 join1.tsv > join1.tsv_s
    join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > joined.blastp.tsv
    """
 }
 // Runs scripts/filter_blastp.awk on a blastp table, passing the two thresholds
 // to the script as the awk variables `coverage` and `identity`, and stores the
 // script's standard output as 'filtered.blastp.tsv'.
 // NOTE(review): the threshold units and the exact filtering rule live in
 // filter_blastp.awk, which is not shown here.
 process FILTER_COVERAGE_IDENTITY_BLASTP {
    input:
    // Order matters: callers must pass coverage first, then identity.
    val min_coverage
    val min_identity
    path blastp
    output:
    path 'filtered.blastp.tsv'
    script:
    """
    awk -f "${baseDir}/scripts/filter_blastp.awk" \
        -v coverage="${min_coverage}" \
        -v identity="${min_identity}" \
        "${blastp}" > 'filtered.blastp.tsv'
    """
 }
 // Sub-workflow that cleans a blastp table in three steps:
 //   1. REMOVE_LOOPS                     - drop rows where column 1 equals column 2
 //   2. ADD_GENE_ID_AND_PROTEIN_LENGTHS  - join the protein length table onto the hits
 //   3. FILTER_COVERAGE_IDENTITY_BLASTP  - apply the coverage / identity thresholds
 //
 // take: blastp          - channel with the blastp table
 //       protein_length  - channel with the protein length table
 //       min_coverage    - coverage threshold
 //       min_identity    - identity threshold
 // emit: blastp          - channel with the filtered table
 workflow FILTER_BLASTP {
    take:
    blastp
    protein_length
    min_coverage
    min_identity
    main:
    REMOVE_LOOPS(blastp)
    // The join step declares two inputs; the length table must be passed too.
    ADD_GENE_ID_AND_PROTEIN_LENGTHS(REMOVE_LOOPS.out, protein_length)
    // Argument order must match the process inputs: coverage first, then identity.
    FILTER_COVERAGE_IDENTITY_BLASTP(min_coverage, min_identity, ADD_GENE_ID_AND_PROTEIN_LENGTHS.out)
    emit:
    // Emit the output channel of the process, not the process itself.
    blastp = FILTER_COVERAGE_IDENTITY_BLASTP.out
 }
--- a/workflow/families_and_TAGs/modules/filter_fasta.nf
+++ b/workflow/families_and_TAGs/modules/filter_fasta.nf
--- a/workflow/families_and_TAGs/modules/gunzip.nf
+++ b/workflow/families_and_TAGs/modules/gunzip.nf
--- a/workflow/families_and_TAGs/modules/tags.nf
+++ b/workflow/families_and_TAGs/modules/tags.nf
@ -0,0 +1,44 @@
 // Runs the `tagfinder` release binary built under rust/tagfinder (located
 // relative to the pipeline directory) and stores its standard output as
 // 'tags.tsv'.
 //
 // input : positions - file passed to tagfinder as --positions
 //         families  - file passed to tagfinder as --families
 // output: 'tags.tsv'
 process TAG_FINDER {
    input:
    path positions
    path families
    output:
    path 'tags.tsv'
    script:
    // baseDir is already an absolute path. Prefixing it with "./" would turn it
    // into a path relative to the task work directory, where the binary does
    // not exist.
    """
    "${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" > 'tags.tsv'
    """
 }
 // Runs scripts/gff3_to_gene_positions_table.awk on a GFF3 file and stores the
 // script's standard output as 'gene_positions.tsv'. The script prints one
 // row per gene feature: gene id, chromosome, start, end.
 process GENE_POSITION_TABLE {
    input:
    // GFF3 annotation file passed to the awk script as its only input file.
    path gff3
    output:
    path 'gene_positions.tsv'
    script:
    """
    awk -f "${baseDir}/scripts/gff3_to_gene_positions_table.awk" "${gff3}" > 'gene_positions.tsv'
    """
 }
 // Sub-workflow: builds the gene position table from the GFF3 annotation and
 // feeds it, together with the families file, to TAG_FINDER.
 //
 // take: gff3     - channel passed to GENE_POSITION_TABLE
 //       families - channel passed to TAG_FINDER as its second input
 // emit: the output channel of TAG_FINDER ('tags.tsv')
 workflow TAGs {
    take:
    gff3
    families
    main:
    GENE_POSITION_TABLE(gff3)
    // TAG_FINDER expects the positions table first, then the families file.
    TAG_FINDER(GENE_POSITION_TABLE.out, families)
    emit:
    TAG_FINDER.out
 }
--- a/workflow/families_and_TAGs/scripts/filter_blastp.awk
+++ b/workflow/families_and_TAGs/scripts/filter_blastp.awk
--- a/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk
+++ b/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk
--- a/workflow/families_and_TAGs/scripts/filter_longest.awk
+++ b/workflow/families_and_TAGs/scripts/filter_longest.awk
--- a/workflow/families_and_TAGs/scripts/filter_records_fasta.awk
+++ b/workflow/families_and_TAGs/scripts/filter_records_fasta.awk
--- a/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
+++ b/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
@ -0,0 +1,22 @@
 #!/usr/bin/env -S awk -f
 # Convert a standard GFF3 file into a custom TSV file
 # to be used in the TAG detection step.
 #
 # Usage: gff3_to_gene_positions_table.awk input.gff
 #
 # Output columns (tab-separated): gene id, chromosome, start, end.
 # Only features whose type (column 3) is listed in `selected` are printed.
 BEGIN {
    # GFF3 columns are tab-delimited; the attribute column may contain spaces,
    # so the default whitespace splitting would shift or truncate fields.
    FS = OFS = "\t"
    selected["gene"] = 1
 }
 # Skip comment/directive lines; `in` avoids creating an array entry per type.
 /^[^#]/ && ($3 in selected) {
    chromosome = $1
    start = $4
    end = $5
    # Look for the ID attribute wherever it appears in column 9. Fall back to
    # the first attribute when no ID is present.
    nattr = split($9, infoarr, ";")
    geneid = infoarr[1]
    for (i = 1; i <= nattr; i++) {
        sub(/^ +/, "", infoarr[i])
        if (infoarr[i] ~ /^ID=/) {
            geneid = infoarr[i]
            break
        }
    }
    # Strip only the leading key and the Ensembl "gene:" prefix, so identifiers
    # that merely contain those substrings are left intact.
    sub(/^ID=/, "", geneid)
    sub(/^gene:/, "", geneid) # in Ensembl GFF.
    print geneid, chromosome, start, end
 }
--- a/workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk
+++ b/workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk
--- a/workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
+++ b/workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
@ -10,7 +10,9 @@ BEGIN {
 # Main rule, run for every input line: each line gets the next
 # family identifier. Every field of the line is printed together with that
 # identifier, one field per output row.
 {
    # Incremented for every line, including lines that are skipped below, so
    # identifiers follow the input line order.
    family_identifier++
    # Lines with a single field are not printed.
    if (NF > 1) {
        for (i=1; i <= NF; i++) {
            print $i, family_identifier
        }
    }
 }
--- a/workflow/families_and_TAGs/scripts/protein_lengths.awk
+++ b/workflow/families_and_TAGs/scripts/protein_lengths.awk
--- a/workflow/families_and_TAGs/scripts/remove_supercontigs.awk
+++ b/workflow/families_and_TAGs/scripts/remove_supercontigs.awk
--- a/workflow/main.nf
+++ b/workflow/main.nf
@ -1,33 +0,0 @@
 /**
 /** Comparative Genomics workflow
 /**
 /** This workflow finds the duplicate genes in a proteome,
 /** then it finds the Tandemly Arrayed Genes (TAGs).
 /**/
 nextflow.enable.dsl = 2;
 include { GUNZIP } from "./modules/gunzip.nf"
 include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
 include { FILTER_FASTA } from "./modules/filter_fasta.nf"
 include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
 include { CLUSTERING } from "./modules/clustering.nf"
 // Declares a step that takes a proteome file and is expected to produce
 // 'protein_gene.tsv'.
 // NOTE(review): no script/exec section is defined, so this process has no
 // command that could create the declared output. It is also never invoked by
 // the workflow below.
 process PROTEIN_GENE_MAPPING {
    input:
    path proteome
    output:
    path 'protein_gene.tsv'
 }
 // Entry workflow: decompress the proteome, filter it, run the all-against-all
 // BLAST, filter the hits and cluster them.
 workflow {
    proteome = Channel.fromPath(params.proteome)
    GUNZIP(proteome)
    FILTER_FASTA(GUNZIP.out)
    BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
    // NOTE(review): four arguments are passed here; confirm this matches the
    // number of inputs declared by FILTER_BLASTP in modules/filter_blastp.nf.
    FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
    CLUSTERING(FILTER_BLASTP.out)
 }
--- a/workflow/modules/filter_blastp.nf
+++ b/workflow/modules/filter_blastp.nf
@ -1,29 +0,0 @@
 /** Filter blastp output based on coverage and identity percentage
 /**/
 // Joins the protein length table onto both id columns of a blastp table, then
 // filters the joined table with scripts/filter_blastp.awk using the coverage
 // and identity thresholds.
 //
 // input : min_coverage, min_identity - thresholds passed to the awk script
 //         blastp          - tab-separated blastp table
 //         protein_length  - tab-separated table keyed on its first column
 //         protein_gene    - staged but not used by the script
 // output: 'filtered_blastp.tsv'
 process FILTER_BLASTP {
    input:
    val min_coverage
    val min_identity
    path blastp
    path protein_length
    path protein_gene
    output:
    path 'filtered_blastp.tsv'
    script:
    // join(1) needs inputs sorted on exactly the tab-delimited join field.
    // The filter must read the joined table, not the raw blastp file, because
    // only the joined table carries the protein lengths.
    """
    sort -t \$'\t' -k 1,1 "${blastp}" > blastp_s
    sort -t \$'\t' -k 1,1 "${protein_length}" > protein_length_s
    join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv
    sort -t \$'\t' -k 2,2 join1.tsv > join1.tsv_s
    join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > 'joined.blastp.tsv'
    awk -f "${baseDir}/scripts/filter_blastp.awk" \
        -v coverage="${min_coverage}" \
        -v identity="${min_identity}" \
        'joined.blastp.tsv' > 'filtered_blastp.tsv'
    """
 }
--- a/workflow/nextflow.config
+++ b/workflow/nextflow.config
@ -1,21 +0,0 @@
 // Default pipeline parameters.
 // NOTE(review): params.min_coverage and params.min_identity are read by the
 // workflow but are not given defaults here - presumably they are supplied on
 // the command line; confirm.
 params {
    // Gzipped protein FASTA, resolved relative to the pipeline directory.
    proteome = "${baseDir}/../../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
    species = "Glycine_max"
    results = "results"
 }
 // Execution profiles.
 profiles {
    // `conda` profile: enables conda and binds an environment file to each
    // process label.
    conda {
        conda.enabled = true
        process {
            // Processes labelled `blast` use conda/blast.yml.
            withLabel: blast {
                conda = "$baseDir/conda/blast.yml"
            }
            // Processes labelled `mcl` use conda/mcl.yml.
            withLabel: mcl {
                conda = "$baseDir/conda/mcl.yml"
            }
        }
    }
 }