commit 0540e3699eb87b159c0a1072da538f5101a861cd Author: Samuel Ortion Date: Mon Oct 28 10:46:05 2024 +0100 Initial Nextflow workflow diff --git a/workflow/conda/blast.yml b/workflow/conda/blast.yml new file mode 100644 index 0000000..2304cfd --- /dev/null +++ b/workflow/conda/blast.yml @@ -0,0 +1,5 @@ +name: blast +channels: + - bioconda +dependencies: + - blast=2.5 diff --git a/workflow/conda/mcl.yml b/workflow/conda/mcl.yml new file mode 100644 index 0000000..203d92e --- /dev/null +++ b/workflow/conda/mcl.yml @@ -0,0 +1,5 @@ +name: mcl +channels: + - bioconda +dependencies: + - bioconda::mcl=22.282 diff --git a/workflow/main.nf b/workflow/main.nf new file mode 100644 index 0000000..8e5af86 --- /dev/null +++ b/workflow/main.nf @@ -0,0 +1,25 @@ +/** +/** Comparative Genomics workflow +/** +/** This workflow find the duplicate genes from a proteome +/** Then, It finds the Tandemly Arrayed Genes (TAGs) +/**/ + +nextflow.enable.dsl = 2; + +include { GUNZIP } from "./modules/gunzip.nf" +include { BLAST_MAKEBLASTDB } from "./modules/blast.nf" +include { BLAST_BLASTP } from "./modules/blast.nf" +include { FILTER_FASTA } from "./modules/filter_fasta.nf" +include { FILTER_BLASTP } from "./modules/filter_blastp.nf" +include { CLUSTERING } from "./modules/clustering.nf" + +workflow { + proteome = Channel.fromPath(params.proteome) + GUNZIP(proteome) + FILTER_FASTA(GUNZIP.out) + BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome) + BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out) + FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length) + CLUSTERING(FILTER_BLASTP.out) +} diff --git a/workflow/modules/blast.nf b/workflow/modules/blast.nf new file mode 100644 index 0000000..aa095b8 --- /dev/null +++ b/workflow/modules/blast.nf @@ -0,0 +1,39 @@ +/** blastp all against all */ + +process BLAST_MAKEBLASTDB { + + label 'blast' + + input: + val species + path proteome + + output: + path 'db.phr' + path 'db.pin' + path 'db.psq' + + script: + """ + makeblastdb -in "${proteome}" -dbtype prot -out 'db' -title "${species}" + """ +} + +process BLAST_BLASTP { + + label 'blast' + + input: + val species + path proteome + path 'db.phr' + path 'db.pin' + path 'db.psq' + output: + path "${species}.all-against-all.blastp.tsv" + + script: + """ + blastp -query "${proteome}" -db 'db' -outfmt '6' -out "${species}.all-against-all.blastp.tsv" + """ +} diff --git a/workflow/modules/clustering.nf b/workflow/modules/clustering.nf new file mode 100644 index 0000000..431760e --- /dev/null +++ b/workflow/modules/clustering.nf @@ -0,0 +1,54 @@ +process BLASTP_TO_ABC { + + input: + path blastp + + output: + path 'graph.abc' + + script: + """ + awk 'BEGIN { OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc' + """ +} + +process MCL { + + input: + path abc + + output: + path 'clustering.mcl' + + script: + """ + mcl "${abc}" --abc -o 'custering.mcl' + """ +} + +process MCL_TO_TSV { + + input: + path mcl + + output: + path 'families.tsv' + + script: + """ + awk -f "${baseDir}/scripts/mcl_to_tsv.awk" "${mcl}" > 'families.tsv' + """ +} + +workflow CLUSTERING { + take: + blastp_tsv + + main: + BLASTP_TO_ABC(blastp_tsv) + MCL(BLASTP_TO_ABC).out + MCL_TO_TSV(MCL.out) + + emit: + families = MCL_TO_TSV.out +} diff --git a/workflow/modules/filter_blastp.awk b/workflow/modules/filter_blastp.awk new file mode 100644 index 0000000..e69de29 diff --git a/workflow/modules/filter_blastp.nf b/workflow/modules/filter_blastp.nf new file mode 100644 index 0000000..125f913 --- /dev/null +++ b/workflow/modules/filter_blastp.nf @@ -0,0 +1,25 @@ +/** Filter blastp output based on coverage and identity percentage +/**/ + +process FILTER_BLASTP { + + input: + val min_coverage + val min_identity + path blastp + path protein_length + + output: + path 'filtered_blastp.tsv' + + script: + """ + sort "${blastp}" > blastp_s + sort "${protein_length}" > protein_length_s + join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv + sort -k 2 "join1.tsv" > join1.tsv_s + join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv' + awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \ + "${blastp}" > 'filtered_blastp.tsv' + """ +} diff --git a/workflow/modules/filter_fasta.nf b/workflow/modules/filter_fasta.nf new file mode 100644 index 0000000..aada3b5 --- /dev/null +++ b/workflow/modules/filter_fasta.nf @@ -0,0 +1,57 @@ +/** 1. Filter out proteome sequence belonging to 'supercontig' */ +/** 2. Filter proteome to keep only the longest isoform */ + +process FILTER_SUPERCONTIG { + input: + path proteome + + output: + path 'proteome_nosupercontig.fasta' + + script: + """ + awk -f "${baseDir}/scripts/remove_supercontigs.awk" "${proteome}" > 'proteome_nosupercontig.fasta' + """ +} + +process SEQUENCE_LENGTH { + input: + path proteome + + output: + path 'protein_lengths.tsv' + + script: + """ + awk -f "${baseDir}/scripts/protein_lengths.awk" "${proteome}" > 'protein_lengths.tsv' + """ +} + +process PROTEOME_LONGEST { + input: + path proteome + path protein_lengths + + output: + path 'proteome_filtered.fa' + + script: + """ + awk -f "${baseDir}/scripts/filter_longest.awk" "${protein_lengths}" "${proteome}" > 'records_filtered.list' + awk -f "${baseDir}/scripts/filter_records_fasta.awk" 'records_filtered.list' "${proteome}" > 'proteome_filtered.fa' + """ +} + +workflow FILTER_FASTA { + take: + proteome + + main: + FILTER_SUPERCONTIG(proteome) + SEQUENCE_LENGTH(FILTER_SUPERCONTIG.out) + PROTEOME_LONGEST(FILTER_SUPERCONTIG.out, SEQUENCE_LENGTH.out) + + emit: + lengths = SEQUENCE_LENGTH.out + proteome = PROTEOME_LONGEST.out +} diff --git a/workflow/modules/gunzip.nf b/workflow/modules/gunzip.nf new file mode 100644 index 0000000..2c68cf6 --- /dev/null +++ b/workflow/modules/gunzip.nf @@ -0,0 +1,12 @@ +process GUNZIP { + input: + path zipped_file + + output: + path zipped_file.baseName + + script: + """ + gunzip -c "${zipped_file}" > "${zipped_file.baseName}" + """ +} diff --git a/workflow/nextflow.config b/workflow/nextflow.config new file mode 100644 index 0000000..eb67cce --- /dev/null +++ b/workflow/nextflow.config @@ -0,0 +1,21 @@ +params { + proteome = "${baseDir}/../../data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz" + species = "Glycine_max" + results = "results" +} + +profiles { + + conda { + conda.enabled = true + + process { + withLabel: blast { + conda = "$baseDir/conda/blast.yml" + } + withLabel: mcl { + conda = "$baseDir/conda/mcl.yml" + } + } + } +} diff --git a/workflow/scripts/filter_blastp.awk b/workflow/scripts/filter_blastp.awk new file mode 100644 index 0000000..ad03f75 --- /dev/null +++ b/workflow/scripts/filter_blastp.awk @@ -0,0 +1,35 @@ +#!/usr/bin/env -S awk -f +# Usage: $0 -v min_identity=30 \ +# -v min_coverage=75 \ +# ./file.blastp.tsv +# The blastp.tsv file should have +# the four following supplementary colums +# query length, +# query gene ID, +# subject length, +# subject gene ID + +BEGIN { + OFS="\t" +} + +function get_coverage(sequence_length, sequence_start, sequence_end) { + return (sequence_end - sequence_start + 1) / sequence_length * 100 +} + +{ + qgeneid=$14 + sgeneid=$16 + qstart=$7 + qend=$8 + sstart=$9 + send=$10 + qlength=$13 + slength=$15 + identity=$3 + evalue=$11 + bitscore=$12 + if (get_coverage(qlength, qstart, qend) > min_coverage && get_coverage(slength, sstart, send) > min_coverage && identity > min_identity) { + print $0 + } +} diff --git a/workflow/scripts/filter_longest.awk b/workflow/scripts/filter_longest.awk new file mode 100644 index 0000000..dc284f3 --- /dev/null +++ b/workflow/scripts/filter_longest.awk @@ -0,0 +1,34 @@ +#!/usr/bin/env -S awk -f +# Extract the FASTA record id with the longest sequence. +# Takes a two columns file with record id, record length +# and the FASTA file. +# +# Usage: filter_longest.awk protein_lengths.tsv proteins.fasta + +NR==FNR { + gsub(">", "", $1) + protein_length[$1] = $2 + next +} + +{ + gene_id = $4 + gsub("gene:", "", gene_id) + isoform_id = $0 + gsub(">", "", isoform_id) + if (gene_id in max_length) { + if (protein_length[isoform_id] > max_length[gene_id]) { + max_length[gene_id] = protein_length[isoform_id] + max_length_isoform[gene_id] = isoform_id + } + } else { + max_length[gene_id] = protein_length[isoform_id] + max_length_isoform[gene_id] = isoform_id + } +} + +END { + for (gene_id in max_length_isoform) { + print max_length_isoform[gene_id] + } +} diff --git a/workflow/scripts/filter_records_fasta.awk b/workflow/scripts/filter_records_fasta.awk new file mode 100644 index 0000000..a0590c9 --- /dev/null +++ b/workflow/scripts/filter_records_fasta.awk @@ -0,0 +1,32 @@ +#!/usr/bin/env -S awk -f +# Filter FASTA file +# to keep only the records +# with ID in records.list +# +# Usage: +# awk -f filter_record_fasta.awk \ +# records.list records.fasta + +BEGIN { + on_good_record = 0 +} + +NR == FNR { + sequence_id = $1 + kept[sequence_id] = 1 + next +} + +/^>/ { + on_good_record = 0 + sequence_id = $1 + gsub(">", "", sequence_id) + if (sequence_id in kept) { + print $0 + on_good_record = 1 + } +} + +/^[^>]/ && on_good_record { + print $0 +} diff --git a/workflow/scripts/mcl_to_tsv.awk b/workflow/scripts/mcl_to_tsv.awk new file mode 100644 index 0000000..861843a --- /dev/null +++ b/workflow/scripts/mcl_to_tsv.awk @@ -0,0 +1,16 @@ +#!/usr/bin/env -S awk -f +# Convert mcl output into tsv output +# with two columns 'gene id, family identifier' + +BEGIN { + family_identifier=0 + OFS="\t" +} + +{ + family_identifier++ + split($0, gene_list, " ") + for (gene in gene_list) { + print gene, family_identifier + } +} diff --git a/workflow/scripts/protein_lengths.awk b/workflow/scripts/protein_lengths.awk new file mode 100644 index 0000000..6db9d5a --- /dev/null +++ b/workflow/scripts/protein_lengths.awk @@ -0,0 +1,27 @@ +#!/usr/bin/env -S awk -f +# Associate a isoform id to the length of the sequence + +BEGIN { + sequence_length=0 + isoform_id="" +} + +/^>/ { + if (isoform_id != "") { + print isoform_id, sequence_length + } else { + isoform_id = $1 + gsub(">", "", isoform_id) + sequence_length = 0 + } +} + +/^[^>]/ { + sequence_length += length($0) +} + +END { + if (isoform_id != "") { + print isoform_id, sequence_length + } +} diff --git a/workflow/scripts/remove_supercontigs.awk b/workflow/scripts/remove_supercontigs.awk new file mode 100644 index 0000000..04474c2 --- /dev/null +++ b/workflow/scripts/remove_supercontigs.awk @@ -0,0 +1,25 @@ +#!/usr/bin/env -S awk -f +# Filter out 'supercontigs' records +# from an Ensembl proteome fasta file +# Usage: +# awk -f remove_supercontigs.awk \ +# "proteome.fasta" \ +# > "proteome_nosupercontigs.fasta" + +BEGIN { + on_good_sequence = 0 +} + +/^>/ { + on_good_sequence = 0 + locus = $3 + split(locus, locus_array, ":") + if (locus_array[1] != "supercontig") { + on_good_sequence = 1 + print $0 + } +} + +/^[^>]/ && on_good_sequence { + print $0 +}