From 3e17d75618d191384d401b960599a5a903a962fc Mon Sep 17 00:00:00 2001
From: Samuel Ortion
Date: Sun, 29 Dec 2024 14:11:49 +0100
Subject: [PATCH] Moved families and TAG workflow to subfolder

---
 workflow/.gitignore                           |  2 +
 .../{ => families_and_TAGs}/conda/blast.yml   |  0
 .../{ => families_and_TAGs}/conda/mcl.yml     |  0
 workflow/families_and_TAGs/main.nf            | 27 +++++++
 .../{ => families_and_TAGs}/modules/blast.nf  |  0
 .../modules/clustering.nf                     | 16 ++++-
 .../modules/filter_blastp.awk                 |  0
 .../modules/filter_blastp.nf                  | 72 +++++++++++++++++++
 .../modules/filter_fasta.nf                   |  0
 .../{ => families_and_TAGs}/modules/gunzip.nf |  0
 workflow/families_and_TAGs/modules/tags.nf    | 44 ++++++++++++
 .../scripts/filter_blastp.awk                 |  0
 .../scripts/filter_blastp_sequence_id.awk     |  0
 .../scripts/filter_longest.awk                |  0
 .../scripts/filter_records_fasta.awk          |  0
 .../scripts/gff3_to_gene_positions_table.awk  | 22 ++++++
 .../scripts/keep_heaviest_edge_abc.awk        |  0
 .../scripts/mcl_to_tsv.awk                    |  6 +-
 .../scripts/protein_lengths.awk               |  0
 .../scripts/remove_supercontigs.awk           |  0
 workflow/main.nf                              | 33 ---------
 workflow/modules/filter_blastp.nf             | 29 --------
 workflow/nextflow.config                      | 21 ------
 23 files changed, 186 insertions(+), 86 deletions(-)
 create mode 100644 workflow/.gitignore
 rename workflow/{ => families_and_TAGs}/conda/blast.yml (100%)
 rename workflow/{ => families_and_TAGs}/conda/mcl.yml (100%)
 create mode 100644 workflow/families_and_TAGs/main.nf
 rename workflow/{ => families_and_TAGs}/modules/blast.nf (100%)
 rename workflow/{ => families_and_TAGs}/modules/clustering.nf (68%)
 rename workflow/{ => families_and_TAGs}/modules/filter_blastp.awk (100%)
 create mode 100644 workflow/families_and_TAGs/modules/filter_blastp.nf
 rename workflow/{ => families_and_TAGs}/modules/filter_fasta.nf (100%)
 rename workflow/{ => families_and_TAGs}/modules/gunzip.nf (100%)
 create mode 100644 workflow/families_and_TAGs/modules/tags.nf
 rename workflow/{ => families_and_TAGs}/scripts/filter_blastp.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/filter_blastp_sequence_id.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/filter_longest.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/filter_records_fasta.awk (100%)
 create mode 100644 workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
 rename workflow/{ => families_and_TAGs}/scripts/keep_heaviest_edge_abc.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/mcl_to_tsv.awk (66%)
 rename workflow/{ => families_and_TAGs}/scripts/protein_lengths.awk (100%)
 rename workflow/{ => families_and_TAGs}/scripts/remove_supercontigs.awk (100%)
 delete mode 100644 workflow/main.nf
 delete mode 100644 workflow/modules/filter_blastp.nf
 delete mode 100644 workflow/nextflow.config

diff --git a/workflow/.gitignore b/workflow/.gitignore
new file mode 100644
index 0000000..de6c3af
--- /dev/null
+++ b/workflow/.gitignore
@@ -0,0 +1,2 @@
+.nextflow.log*
+
diff --git a/workflow/conda/blast.yml b/workflow/families_and_TAGs/conda/blast.yml
similarity index 100%
rename from workflow/conda/blast.yml
rename to workflow/families_and_TAGs/conda/blast.yml
diff --git a/workflow/conda/mcl.yml b/workflow/families_and_TAGs/conda/mcl.yml
similarity index 100%
rename from workflow/conda/mcl.yml
rename to workflow/families_and_TAGs/conda/mcl.yml
diff --git a/workflow/families_and_TAGs/main.nf b/workflow/families_and_TAGs/main.nf
new file mode 100644
index 0000000..9b354d3
--- /dev/null
+++ b/workflow/families_and_TAGs/main.nf
@@ -0,0 +1,27 @@
+/**
+/** Comparative Genomics workflow
+/**
+/** This workflow finds the duplicate genes from a proteome,
+/** then it finds the Tandemly Arrayed Genes (TAGs).
+/**/
+include { GUNZIP as GUNZIP_1 } from "./modules/gunzip.nf"
+include { GUNZIP as GUNZIP_2 } from "./modules/gunzip.nf"
+// include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
+include { FILTER_FASTA } from "./modules/filter_fasta.nf"
+include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
+include { CLUSTERING } from "./modules/clustering.nf"
+include { TAGs } from "./modules/tags.nf"
+
+workflow {
+    proteome = Channel.fromPath(params.proteome, checkIfExists: true)
+    gff3 = Channel.fromPath(params.gff3, checkIfExists: true)
+    GUNZIP_1(proteome)
+    GUNZIP_2(gff3)
+    FILTER_FASTA(GUNZIP_1.out)
+
+    // Disabled steps. Arguments follow each sub-workflow's take: order.
+    // BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
+    // FILTER_BLASTP(BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths, params.min_coverage, params.min_identity)
+    // CLUSTERING(FILTER_BLASTP.out)
+    // TAGs(GUNZIP_2.out, CLUSTERING.out)
+}
diff --git a/workflow/modules/blast.nf b/workflow/families_and_TAGs/modules/blast.nf
similarity index 100%
rename from workflow/modules/blast.nf
rename to workflow/families_and_TAGs/modules/blast.nf
diff --git a/workflow/modules/clustering.nf b/workflow/families_and_TAGs/modules/clustering.nf
similarity index 68%
rename from workflow/modules/clustering.nf
rename to workflow/families_and_TAGs/modules/clustering.nf
index 213be83..a850ac6 100644
--- a/workflow/modules/clustering.nf
+++ b/workflow/families_and_TAGs/modules/clustering.nf
@@ -8,7 +8,21 @@ process BLASTP_TO_ABC {
 
     script:
     """
-    awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
+    awk 'BEGIN { FS = OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc'
+    """
+}
+
+process KEEP_HEAVIEST_EDGE {
+    // Runs scripts/keep_heaviest_edge_abc.awk on an ABC edge list (script not shown in this patch).
+    input:
+    path abc
+
+    output:
+    path 'heaviest.abc'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/keep_heaviest_edge_abc.awk" "${abc}" > 'heaviest.abc'
     """
 }
 
diff --git a/workflow/modules/filter_blastp.awk b/workflow/families_and_TAGs/modules/filter_blastp.awk
similarity index 100%
rename from workflow/modules/filter_blastp.awk
rename to workflow/families_and_TAGs/modules/filter_blastp.awk
diff --git a/workflow/families_and_TAGs/modules/filter_blastp.nf b/workflow/families_and_TAGs/modules/filter_blastp.nf
new file mode 100644
index 0000000..af27b95
--- /dev/null
+++ b/workflow/families_and_TAGs/modules/filter_blastp.nf
@@ -0,0 +1,72 @@
+/** Filter blastp output based on coverage and identity percentage
+/**/
+
+process REMOVE_LOOPS {
+    // Drop self-hits: rows whose first two columns are identical.
+    input:
+    path blastp
+
+    output:
+    path 'noloops.blastp.tsv'
+
+    script:
+    // awk must read the staged input and write the declared output file.
+    """
+    awk 'BEGIN { FS=OFS="\t" } \$1 != \$2 { print \$0 }' "${blastp}" > 'noloops.blastp.tsv'
+    """
+}
+
+process ADD_GENE_ID_AND_PROTEIN_LENGTHS {
+    // Append the columns of protein_length for the id in column 1, then for the id in column 2.
+    // NOTE(review): join prints its join field first, so the 2nd join swaps columns 1 and 2 -- confirm downstream.
+    input:
+    path blastp
+    path protein_length
+
+    output:
+    path 'joined.blastp.tsv'
+    // Sort on exactly the tab-delimited join field, as join requires.
+    script:
+    """
+    sort -t \$'\t' -k 1,1 "${blastp}" > blastp_s
+    sort -t \$'\t' -k 1,1 "${protein_length}" > protein_length_s
+    join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv
+    sort -t \$'\t' -k 2,2 join1.tsv > join1.tsv_s
+    join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > joined.blastp.tsv
+    """
+}
+
+process FILTER_COVERAGE_IDENTITY_BLASTP {
+    // Run scripts/filter_blastp.awk with the coverage and identity thresholds.
+    input:
+    val min_coverage
+    val min_identity
+    path blastp
+
+    output:
+    path 'filtered.blastp.tsv'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/filter_blastp.awk" \
+        -v coverage="${min_coverage}" \
+        -v identity="${min_identity}" \
+        "${blastp}" > 'filtered.blastp.tsv'
+    """
+}
+
+workflow FILTER_BLASTP {
+    take:
+    blastp
+    protein_length
+    min_coverage
+    min_identity
+
+    main:
+    REMOVE_LOOPS(blastp)
+    ADD_GENE_ID_AND_PROTEIN_LENGTHS(REMOVE_LOOPS.out, protein_length)
+    // Argument order must follow the process inputs: coverage, identity, table.
+    FILTER_COVERAGE_IDENTITY_BLASTP(min_coverage, min_identity, ADD_GENE_ID_AND_PROTEIN_LENGTHS.out)
+    emit:
+    blastp = FILTER_COVERAGE_IDENTITY_BLASTP.out
+}
diff --git a/workflow/modules/filter_fasta.nf b/workflow/families_and_TAGs/modules/filter_fasta.nf
similarity index 100%
rename from workflow/modules/filter_fasta.nf
rename to workflow/families_and_TAGs/modules/filter_fasta.nf
diff --git a/workflow/modules/gunzip.nf b/workflow/families_and_TAGs/modules/gunzip.nf
similarity index 100%
rename from workflow/modules/gunzip.nf
rename to workflow/families_and_TAGs/modules/gunzip.nf
diff --git a/workflow/families_and_TAGs/modules/tags.nf b/workflow/families_and_TAGs/modules/tags.nf
new file mode 100644
index 0000000..43ebfb7
--- /dev/null
+++ b/workflow/families_and_TAGs/modules/tags.nf
@@ -0,0 +1,44 @@
+process TAG_FINDER {
+    // Run the tagfinder binary on the gene positions and families tables.
+    input:
+    path positions
+    path families
+
+    output:
+    path 'tags.tsv'
+    // baseDir is already an absolute path, so it must not be prefixed with './'.
+    script:
+    """
+    "${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" > 'tags.tsv'
+    """
+}
+
+
+process GENE_POSITION_TABLE {
+    // Convert a GFF3 file to a TSV table with scripts/gff3_to_gene_positions_table.awk.
+    input:
+    path gff3
+
+    output:
+    path 'gene_positions.tsv'
+
+    script:
+    """
+    awk -f "${baseDir}/scripts/gff3_to_gene_positions_table.awk" "${gff3}" > 'gene_positions.tsv'
+    """
+}
+
+
+workflow TAGs {
+    // Build the gene position table from the GFF3, then run TAG_FINDER with the families table.
+    take:
+    gff3
+    families
+
+    main:
+    GENE_POSITION_TABLE(gff3)
+    TAG_FINDER(GENE_POSITION_TABLE.out, families)
+
+    emit:
+    TAG_FINDER.out
+}
diff --git a/workflow/scripts/filter_blastp.awk b/workflow/families_and_TAGs/scripts/filter_blastp.awk
similarity index 100%
rename from workflow/scripts/filter_blastp.awk
rename to workflow/families_and_TAGs/scripts/filter_blastp.awk
diff --git a/workflow/scripts/filter_blastp_sequence_id.awk b/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk
similarity index 100%
rename from workflow/scripts/filter_blastp_sequence_id.awk
rename to workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk
diff --git a/workflow/scripts/filter_longest.awk b/workflow/families_and_TAGs/scripts/filter_longest.awk
similarity index 100%
rename from workflow/scripts/filter_longest.awk
rename to workflow/families_and_TAGs/scripts/filter_longest.awk
diff --git a/workflow/scripts/filter_records_fasta.awk b/workflow/families_and_TAGs/scripts/filter_records_fasta.awk
similarity index 100%
rename from workflow/scripts/filter_records_fasta.awk
rename to workflow/families_and_TAGs/scripts/filter_records_fasta.awk
diff --git a/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk b/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
new file mode 100644
index 0000000..94c6e9f
--- /dev/null
+++ b/workflow/families_and_TAGs/scripts/gff3_to_gene_positions_table.awk
@@ -0,0 +1,22 @@
+#!/usr/bin/env -S awk -f
+# Convert a standard GFF 3 file
+# into a custom TSV file
+# to be used in detect TAGs step
+# Usage: gff3_to_gene_positions_table.awk input.gff
+
+BEGIN {
+    # GFF3 columns are tab-separated and column 9 may contain spaces,
+    # so split on tabs only.
+    FS = OFS = "\t"
+    selected["gene"] = 1
+}
+
+# Non-comment lines whose feature type (column 3) is selected.
+/^[^#]/ && ($3 in selected) {
+    split($9, infoarr, ";")
+    geneid = infoarr[1]  # assumes ID is the first attribute -- TODO confirm
+    sub(/^ID=/, "", geneid)
+    sub(/^gene:/, "", geneid)  # in Ensembl GFF.
+    # Output columns: gene id, sequence id, start, end.
+    print geneid, $1, $4, $5
+}
diff --git a/workflow/scripts/keep_heaviest_edge_abc.awk b/workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk
similarity index 100%
rename from workflow/scripts/keep_heaviest_edge_abc.awk
rename to workflow/families_and_TAGs/scripts/keep_heaviest_edge_abc.awk
diff --git a/workflow/scripts/mcl_to_tsv.awk b/workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
similarity index 66%
rename from workflow/scripts/mcl_to_tsv.awk
rename to workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
index d981c34..e24afea 100644
--- a/workflow/scripts/mcl_to_tsv.awk
+++ b/workflow/families_and_TAGs/scripts/mcl_to_tsv.awk
@@ -10,7 +10,9 @@ BEGIN {
 
 {
     family_identifier++
-    for (i=1; i <= NF; i++) {
-        print $i, family_identifier
+    if (NF > 1) {
+        for (i=1; i <= NF; i++) {
+            print $i, family_identifier
+        }
     }
 }
diff --git a/workflow/scripts/protein_lengths.awk b/workflow/families_and_TAGs/scripts/protein_lengths.awk
similarity index 100%
rename from workflow/scripts/protein_lengths.awk
rename to workflow/families_and_TAGs/scripts/protein_lengths.awk
diff --git a/workflow/scripts/remove_supercontigs.awk b/workflow/families_and_TAGs/scripts/remove_supercontigs.awk
similarity index 100%
rename from workflow/scripts/remove_supercontigs.awk
rename to workflow/families_and_TAGs/scripts/remove_supercontigs.awk
diff --git a/workflow/main.nf b/workflow/main.nf
deleted file mode 100644
index d556efc..0000000
--- a/workflow/main.nf
+++ /dev/null
@@ -1,33 +0,0 @@
-/**
-/** Comparative Genomics workflow
-/**
-/** This workflow find the duplicate genes from a proteome
-/** Then, It finds the Tandemly Arrayed Genes (TAGs)
-/**/
-
-nextflow.enable.dsl = 2;
-
-include
{ GUNZIP } from "./modules/gunzip.nf" -include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf" -include { FILTER_FASTA } from "./modules/filter_fasta.nf" -include { FILTER_BLASTP } from "./modules/filter_blastp.nf" -include { CLUSTERING } from "./modules/clustering.nf" - -process PROTEIN_GENE_MAPPING { - - input: - path proteome - - output: - path 'protein_gene.tsv' -} - -workflow { - proteome = Channel.fromPath(params.proteome) - GUNZIP(proteome) - FILTER_FASTA(GUNZIP.out) - BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome) - FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths) - - CLUSTERING(FILTER_BLASTP.out) -} diff --git a/workflow/modules/filter_blastp.nf b/workflow/modules/filter_blastp.nf deleted file mode 100644 index 4a54336..0000000 --- a/workflow/modules/filter_blastp.nf +++ /dev/null @@ -1,29 +0,0 @@ -/** Filter blastp output based on coverage and identity percentage -/**/ - - -process FILTER_BLASTP { - - input: - val min_coverage - val min_identity - path blastp - path protein_length - path protein_gene - - output: - path 'filtered_blastp.tsv' - - script: - """ - sort -k 1 "${blastp}" > blastp_s - sort -k 1 "${protein_length}" > protein_length_s - join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv - sort -k 2 join1.tsv > join1.tsv_s - join -1 2 -2 1 -t'\t' join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv' - awk -f "${baseDir}/scripts/filter_blastp.awk" \ - -v coverage="${min_coverage}" \ - -v identity="${min_identity}" \ - "${blastp}" > 'filtered_blastp.tsv' - """ -} diff --git a/workflow/nextflow.config b/workflow/nextflow.config deleted file mode 100644 index eb67cce..0000000 --- a/workflow/nextflow.config +++ /dev/null @@ -1,21 +0,0 @@ -params { - proteome = "${baseDir}/../../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz" - species = "Glycine_max" - results = "results" -} - -profiles { - - conda { - conda.enabled = true - - process { - withLabel: blast { - conda = 
"$baseDir/conda/blast.yml" - } - withLabel: mcl { - conda = "$baseDir/conda/mcl.yml" - } - } - } -}