diff --git a/workflow/families_and_TAGs/input.csv b/workflow/families_and_TAGs/input.csv new file mode 100644 index 0000000..d4a87b3 --- /dev/null +++ b/workflow/families_and_TAGs/input.csv @@ -0,0 +1,10 @@ +species,proteome,gff3,blastp +Glycine max,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme +Gossypium raimondii,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme +Malus domestica Golden,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2 +Musa acuminata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Musa_acuminata.Musa_acuminata_v2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Musa_acuminata.Musa_acuminata_v2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Musa_acuminata_Blastp_longIsoforme +Oryza sativa,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa_Blastp_longIsoforme +Prunus persica,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme +Solanum tuberosum,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme +Vigna radiata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme +Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme \ No newline at end of file diff --git a/workflow/families_and_TAGs/modules/clustering.nf b/workflow/families_and_TAGs/modules/clustering.nf index a850ac6..54cf8e5 100644 --- a/workflow/families_and_TAGs/modules/clustering.nf +++ b/workflow/families_and_TAGs/modules/clustering.nf @@ -8,7 +8,7 @@ process BLASTP_TO_ABC { script: """ - awk 'BEGIN { FS = OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc' + awk 'BEGIN { FS=OFS="\t" } { print \$14, \$16, \$12 }' "${blastp}" > 'graph.abc' """ } @@ -36,7 +36,7 @@ process MCL { script: """ - mcl "${abc}" --abc -o 'custering.mcl' + mcl "${abc}" --abc -o 'clustering.mcl' """ } diff --git a/workflow/families_and_TAGs/modules/filter_blastp.awk b/workflow/families_and_TAGs/modules/filter_blastp.awk deleted file mode 100644 index e69de29..0000000 diff --git a/workflow/families_and_TAGs/modules/filter_blastp.nf b/workflow/families_and_TAGs/modules/filter_blastp.nf index af27b95..dbd4f3b 100644 --- a/workflow/families_and_TAGs/modules/filter_blastp.nf +++ b/workflow/families_and_TAGs/modules/filter_blastp.nf @@ -1,6 +1,23 @@ /** Filter blastp output based on coverage and identity percentage /**/ +process REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA { + input: + path proteome + path blastp + output: + path 'blastp_no_supercontigs_chloroplast_nor_mitochondria.tsv' + script: + """ + awk -f "${baseDir}/scripts/black_gene_list.awk" "${proteome}" > 'blacklist.list' + if [ ! -s 'blacklist.list' ]; then + cp "${blastp}" 'blastp_no_supercontigs_chloroplast_nor_mitochondria.tsv' + else + awk -f "${baseDir}/scripts/filter_blastp_sequence_id_remove.awk" 'blacklist.list' "${blastp}" > 'blastp_no_supercontigs_chloroplast_nor_mitochondria.tsv' + fi + """ +} + process REMOVE_LOOPS { input: @@ -12,7 +29,7 @@ process REMOVE_LOOPS { script: """ - awk 'BEGIN { FS=OFS="\t" } \$1 != \$2 { print \$0 }' + awk 'BEGIN { FS=OFS="\t" } \$1 != \$2 { print \$0 }' "${blastp}" > 'noloops.blastp.tsv' """ } @@ -27,11 +44,11 @@ process ADD_GENE_ID_AND_PROTEIN_LENGTHS { script: """ - sort -k 1 "${blastp}" > blastp_s - sort -k 1 "${protein_length}" > protein_length_s - join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv - sort -k 2 join1.tsv > join1.tsv_s - join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > joined.blastp.tsv + LANG=en_EN sort -t \$'\t' -k 1 "${blastp}" > blastp_s + LANG=en_EN sort -t \$'\t' -k 1 "${protein_length}" > protein_length_s + LANG=en_EN join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv + LANG=en_EN sort -t \$'\t' -k 2 join1.tsv > join1.tsv_s + LANG=en_EN join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > joined.blastp.tsv """ } @@ -39,9 +56,9 @@ process ADD_GENE_ID_AND_PROTEIN_LENGTHS { process FILTER_COVERAGE_IDENTITY_BLASTP { input: + path blastp val min_coverage val min_identity - path blastp output: path 'filtered.blastp.tsv' @@ -58,15 +75,17 @@ process FILTER_COVERAGE_IDENTITY_BLASTP { workflow FILTER_BLASTP { take: blastp + proteome protein_length min_coverage min_identity main: - REMOVE_LOOPS(blastp) - ADD_GENE_ID_AND_PROTEIN_LENGTHS(REMOVE_LOOPS.out) - FILTER_COVERAGE_IDENTITY_BLASTP(min_identity, min_coverage, ADD_GENE_ID_AND_PROTEIN_LENGTHS.out) + REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA(proteome, blastp) + REMOVE_LOOPS(REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA.out) + ADD_GENE_ID_AND_PROTEIN_LENGTHS(REMOVE_LOOPS.out, protein_length) + FILTER_COVERAGE_IDENTITY_BLASTP(ADD_GENE_ID_AND_PROTEIN_LENGTHS.out, min_identity, min_coverage) emit: - blastp=FILTER_COVERAGE_IDENTITY_BLASTP + blastp=FILTER_COVERAGE_IDENTITY_BLASTP.out } diff --git a/workflow/families_and_TAGs/modules/filter_fasta.nf b/workflow/families_and_TAGs/modules/filter_fasta.nf index aada3b5..1c2e972 100644 --- a/workflow/families_and_TAGs/modules/filter_fasta.nf +++ b/workflow/families_and_TAGs/modules/filter_fasta.nf @@ -1,16 +1,14 @@ -/** 1. Filter out proteome sequence belonging to 'supercontig' */ +/** 1. Filter out proteome sequence belonging to 'supercontig', Mt, or Pt */ /** 2. Filter proteome to keep only the longest isoform */ -process FILTER_SUPERCONTIG { +process REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA { input: path proteome - output: - path 'proteome_nosupercontig.fasta' - + path 'filtered_proteome.fasta' script: """ - awk -f "${baseDir}/scripts/remove_supercontigs.awk" "${proteome}" > 'proteome_nosupercontig.fasta' + awk -f "${baseDir}/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk" "${proteome}" > 'filtered_proteome.fasta' """ } @@ -42,14 +40,16 @@ process PROTEOME_LONGEST { """ } + + workflow FILTER_FASTA { take: proteome main: - FILTER_SUPERCONTIG(proteome) - SEQUENCE_LENGTH(FILTER_SUPERCONTIG.out) - PROTEOME_LONGEST(FILTER_SUPERCONTIG.out, SEQUENCE_LENGTH.out) + REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA(proteome) + SEQUENCE_LENGTH(REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA.out) + PROTEOME_LONGEST(REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA.out, SEQUENCE_LENGTH.out) emit: lengths = SEQUENCE_LENGTH.out diff --git a/workflow/families_and_TAGs/modules/tags.nf b/workflow/families_and_TAGs/modules/tags.nf index 43ebfb7..ef3a8fb 100644 --- a/workflow/families_and_TAGs/modules/tags.nf +++ b/workflow/families_and_TAGs/modules/tags.nf @@ -9,7 +9,7 @@ process TAG_FINDER { script: """ - "./${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" > 'tags.tsv' + "${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0 > 'tags.tsv' """ } @@ -32,8 +32,8 @@ process GENE_POSITION_TABLE { workflow TAGs { take: - gff3 families + gff3 main: GENE_POSITION_TABLE(gff3) diff --git a/workflow/families_and_TAGs/run_all.sh b/workflow/families_and_TAGs/run_all.sh new file mode 100644 index 0000000..e79081b --- /dev/null +++ b/workflow/families_and_TAGs/run_all.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euo pipefail +tail -n +2 input.csv | while IFS= read -r line; do + species="$(echo $line | cut -d ',' -f 1)" + proteome="$(echo $line | cut -d ',' -f 2)" + gff3="$(echo $line | cut -d ',' -f 3)" + blastp="$(echo $line | cut -d ',' -f 4)" + echo Launching for $species + nextflow run run_one.nf -resume -profile conda --species "$species" --proteome "$proteome" --gff3 "$gff3" --blastp "$blastp" +done \ No newline at end of file diff --git a/workflow/families_and_TAGs/run_one.nf b/workflow/families_and_TAGs/run_one.nf new file mode 100644 index 0000000..5115543 --- /dev/null +++ b/workflow/families_and_TAGs/run_one.nf @@ -0,0 +1,46 @@ +/** Run the workflow for all species in the csv table */ + +include { GUNZIP as GUNZIP_1 } from "./modules/gunzip.nf" +include { GUNZIP as GUNZIP_2 } from "./modules/gunzip.nf" +include { FILTER_FASTA } from "./modules/filter_fasta.nf" +include { FILTER_BLASTP } from "./modules/filter_blastp.nf" +include { CLUSTERING } from "./modules/clustering.nf" +include { TAGs } from "./modules/tags.nf" + + + +process CSV_ROW { + publishDir "results/", mode: 'copy' + input: + val(species) + path(proteome) + path(tags) + path(families) + output: + path "${species}.csv" + script: + """ + nb_tag=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }') + nb_families=\$(tail -1 "${families}" | awk '{ print \$2 }') + nb_duplicates=\$(cat "${families}" | wc -l) + nb_genes=\$(awk '/^>/ { print \$4 }' "${proteome}" | sort | uniq | wc -l) + nb_singletons=\$((nb_genes - nb_duplicates + 1)) + echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_tag}" > "${species}.csv" + """ +} + + + +workflow { + proteome = file(params.proteome) + gff3 = file(params.gff3) + blastp = file(params.blastp) + + GUNZIP_1(proteome) + GUNZIP_2(gff3) + FILTER_FASTA(GUNZIP_1.out) + FILTER_BLASTP(blastp, GUNZIP_1.out, FILTER_FASTA.out.lengths, params.min_coverage, params.min_identity) + CLUSTERING(FILTER_BLASTP.out.blastp) + TAGs(CLUSTERING.out, GUNZIP_2.out) + CSV_ROW(params.species, GUNZIP_1.out, TAGs.out, CLUSTERING.out) +} \ No newline at end of file diff --git a/workflow/families_and_TAGs/scripts/black_gene_list.awk b/workflow/families_and_TAGs/scripts/black_gene_list.awk new file mode 100644 index 0000000..ebd4675 --- /dev/null +++ b/workflow/families_and_TAGs/scripts/black_gene_list.awk @@ -0,0 +1,12 @@ +#!/usr/bin/env -S awk -f + +/^>/ { + locus=$3 + split(":", arr, locus) + chromosome=arr[3] + if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /supercontig/) { + record_id=$1 + gsub(">", "", record_id) + print record_id + } +} \ No newline at end of file diff --git a/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk b/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk index 7b9fc28..a2204a5 100644 --- a/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk +++ b/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id.awk @@ -9,13 +9,13 @@ NR == FNR { sequence_id = $1 - to_remove[sequence_id] = 1 + kept[sequence_id] = 1 next } { sequence_id = $1 - if (!(sequence_id in to_remove)) { + if (sequence_id in kept || $2 in kept) { print $0 } -} +} \ No newline at end of file diff --git a/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk b/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk new file mode 100644 index 0000000..0c1ced5 --- /dev/null +++ b/workflow/families_and_TAGs/scripts/filter_blastp_sequence_id_remove.awk @@ -0,0 +1,25 @@ +#!/usr/bin/env -S awk -f +# Filter BLASTp format 6 file +# to keep only the records +# with no ID in records.list +# +# Usage: +# awk -f filter_blastp_sequence_id.awk \ +# records.list records.blastp.tsv + +BEGIN { + FS="\t" + OFS="\t" +} + +NR==FNR { + sequence_id=$1 + kept[sequence_id]=1 + next +} + +{ + if (!($1 in kept) && !($2 in kept)) { + print $0 + } +} diff --git a/workflow/families_and_TAGs/scripts/filter_records_fasta.awk b/workflow/families_and_TAGs/scripts/filter_records_fasta.awk index a0590c9..009f17f 100644 --- a/workflow/families_and_TAGs/scripts/filter_records_fasta.awk +++ b/workflow/families_and_TAGs/scripts/filter_records_fasta.awk @@ -1,10 +1,10 @@ #!/usr/bin/env -S awk -f # Filter FASTA file -# to keep only the records +# to keep or remove only the records # with ID in records.list # # Usage: -# awk -f filter_record_fasta.awk \ +# awk -f (-v direction=keep|remove) filter_record_fasta.awk \ # records.list records.fasta BEGIN { @@ -22,8 +22,15 @@ NR == FNR { sequence_id = $1 gsub(">", "", sequence_id) if (sequence_id in kept) { - print $0 - on_good_record = 1 + if (direction == "keep") { + print $0 + on_good_record = 1 + } + } else { + if (direction == "remove") { + print $0 + on_good_record = 1 + } } } diff --git a/workflow/families_and_TAGs/scripts/remove_supercontigs.awk b/workflow/families_and_TAGs/scripts/remove_supercontigs.awk deleted file mode 100644 index 04474c2..0000000 --- a/workflow/families_and_TAGs/scripts/remove_supercontigs.awk +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env -S awk -f -# Filter out 'supercontigs' records -# from an Ensembl proteome fasta file -# Usage: -# awk -f remove_supercontigs.awk \ -# "proteome.fasta" \ -# > "proteome_nosupercontigs.fasta" - -BEGIN { - on_good_sequence = 0 -} - -/^>/ { - on_good_sequence = 0 - locus = $3 - split(locus, locus_array, ":") - if (locus_array[1] != "supercontig") { - on_good_sequence = 1 - print $0 - } -} - -/^[^>]/ && on_good_sequence { - print $0 -} diff --git a/workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk b/workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk new file mode 100644 index 0000000..9cf0c3d --- /dev/null +++ b/workflow/families_and_TAGs/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk @@ -0,0 +1,21 @@ +#!/usr/bin/env -S awk -f + +BEGIN { + on_good_sequence = 0 +} + +/^>/ { + locus=$3 + split(locus, arr, ":") + chromosome=arr[3] + if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /supercontig/) { + on_good_sequence = 0 + } else { + on_good_sequence = 1 + print $0 + } +} + +/^[^>]/ && on_good_sequence == 1 { + print $0 +}