From ee981b5a26621c7779d905c821e423605acecb11 Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Wed, 8 Jan 2025 13:15:15 +0100 Subject: [PATCH] Update the families and TAGs pipeline --- workflow/families_and_TAGs/concat.csv | 12 +++++++++ workflow/families_and_TAGs/input.csv | 5 ++-- workflow/families_and_TAGs/main.nf | 2 +- .../modules/filter_blastp.nf | 4 +-- workflow/families_and_TAGs/modules/tags.nf | 2 +- workflow/families_and_TAGs/nextflow.config | 25 +++++++++++++++++++ workflow/families_and_TAGs/run_all.sh | 6 +++-- workflow/families_and_TAGs/run_one.nf | 11 +++++--- 8 files changed, 56 insertions(+), 11 deletions(-) create mode 100644 workflow/families_and_TAGs/concat.csv create mode 100644 workflow/families_and_TAGs/nextflow.config diff --git a/workflow/families_and_TAGs/concat.csv b/workflow/families_and_TAGs/concat.csv new file mode 100644 index 0000000..a6d2ef5 --- /dev/null +++ b/workflow/families_and_TAGs/concat.csv @@ -0,0 +1,12 @@ +Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0 +Arabidopsis lyrata,32667,26320,6348,5012,494,1571,1834,3868,4739,14 +Glycine max,55897,50258,5640,8421,437,2626,2922,6692,7890,32 +Gossypium raimondii,38208,32975,5234,5519,333,1964,2163,5202,6155,23 +Malus domestica Golden,40624,34948,5677,6912,858,2340,2707,5643,6988,15 +Musa acuminata,35012,28752,6261,4698,624,949,1069,2345,2745,21 +Oryza sativa,35775,23377,12399,4605,787,1440,1835,3544,4906,19 +Prunus persica,26873,20222,6652,3653,292,1758,1949,4962,5928,22 +Solanum tuberosum,39021,31477,7545,4465,1044,2558,2891,6488,7903,16 +Theobroma cacao,29188,21051,8138,3614,606,1593,1836,4041,5069,22 +Vigna angularis,33860,26954,6907,4608,649,1622,1939,3820,4771,26 +Vigna radiata,22368,17021,5348,3556,411,599,728,1332,1705,7 diff --git a/workflow/families_and_TAGs/input.csv b/workflow/families_and_TAGs/input.csv index d4a87b3..4d43070 100644 --- a/workflow/families_and_TAGs/input.csv +++ b/workflow/families_and_TAGs/input.csv @@ -1,4 +1,3 @@ -species,proteome,gff3,blastp Glycine max,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme Gossypium raimondii,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme Malus domestica Golden,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2 @@ -7,4 +6,6 @@ Oryza sativa,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics Prunus persica,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme Solanum tuberosum,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme Vigna radiata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme -Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme \ No newline at end of file +Vigna angularis,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis_Blastp_longIsoforme +Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme +Arabidopsis lyrata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata_Blastp_longIsoforme diff --git a/workflow/families_and_TAGs/main.nf b/workflow/families_and_TAGs/main.nf index 9b354d3..3899d72 100644 --- a/workflow/families_and_TAGs/main.nf +++ b/workflow/families_and_TAGs/main.nf @@ -19,7 +19,7 @@ workflow { GUNZIP_2(gff3) FILTER_FASTA(GUNZIP_1.out) // BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome) -// FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths) +// FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, GUNZIP_1.out, FILTER_FASTA.out.lengths) // CLUSTERING(FILTER_BLASTP.out) diff --git a/workflow/families_and_TAGs/modules/filter_blastp.nf b/workflow/families_and_TAGs/modules/filter_blastp.nf index dbd4f3b..2218008 100644 --- a/workflow/families_and_TAGs/modules/filter_blastp.nf +++ b/workflow/families_and_TAGs/modules/filter_blastp.nf @@ -66,8 +66,8 @@ process FILTER_COVERAGE_IDENTITY_BLASTP { script: """ awk -f "${baseDir}/scripts/filter_blastp.awk" \ - -v coverage="${min_coverage}" \ - -v identity="${min_identity}" \ + -v min_coverage="${min_coverage}" \ + -v min_identity="${min_identity}" \ "${blastp}" > 'filtered.blastp.tsv' """ } diff --git a/workflow/families_and_TAGs/modules/tags.nf b/workflow/families_and_TAGs/modules/tags.nf index ef3a8fb..84f9cba 100644 --- a/workflow/families_and_TAGs/modules/tags.nf +++ b/workflow/families_and_TAGs/modules/tags.nf @@ -9,7 +9,7 @@ process TAG_FINDER { script: """ - "${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0 > 'tags.tsv' + "${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0,1 > 'tags.tsv' """ } diff --git a/workflow/families_and_TAGs/nextflow.config b/workflow/families_and_TAGs/nextflow.config new file mode 100644 index 0000000..65b4ba0 --- /dev/null +++ b/workflow/families_and_TAGs/nextflow.config @@ -0,0 +1,25 @@ +params { + gff3 = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz" + proteome = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz" + blastp = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme" + species = "Glycine_max" + results = "results" + min_coverage = 30 + min_identity = 30 +} + +profiles { + + conda { + conda.enabled = true + + process { + withLabel: 'mcl' { + conda = "conda/mcl.yml" + } + withLabel: 'blast' { + conda = "conda/blast.yml" + } + } + } +} diff --git a/workflow/families_and_TAGs/run_all.sh b/workflow/families_and_TAGs/run_all.sh index e79081b..ac949f2 100644 --- a/workflow/families_and_TAGs/run_all.sh +++ b/workflow/families_and_TAGs/run_all.sh @@ -1,10 +1,12 @@ #!/usr/bin/env bash set -euo pipefail -tail -n +2 input.csv | while IFS= read -r line; do +while IFS= read -r line; do species="$(echo $line | cut -d ',' -f 1)" proteome="$(echo $line | cut -d ',' -f 2)" gff3="$(echo $line | cut -d ',' -f 3)" blastp="$(echo $line | cut -d ',' -f 4)" echo Launching for $species nextflow run run_one.nf -resume -profile conda --species "$species" --proteome "$proteome" --gff3 "$gff3" --blastp "$blastp" -done \ No newline at end of file +done < "input.csv" +echo "Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0" > concat.csv +cat results/*.csv >> concat.csv diff --git a/workflow/families_and_TAGs/run_one.nf b/workflow/families_and_TAGs/run_one.nf index 5115543..0a5d31c 100644 --- a/workflow/families_and_TAGs/run_one.nf +++ b/workflow/families_and_TAGs/run_one.nf @@ -20,12 +20,17 @@ process CSV_ROW { path "${species}.csv" script: """ - nb_tag=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }') + nb_tag0=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }') + nb_tag1=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$4 != "-" && \$4 > max) max = \$4 } END { print max }') + nb_tag_genes0=\$(tail -n +2 "${tags}" | awk 'BEGIN {count = 0} \$3 != "-" { count += 1 } END { print count }' ) + nb_tag_genes1=\$(tail -n +2 "${tags}" | awk 'BEGIN {count = 0} \$4 != "-" { count += 1 } END { print count }' ) nb_families=\$(tail -1 "${families}" | awk '{ print \$2 }') + nb_genes_largest_family=\$(awk '\$2 != "-" { count[\$2] += 1} END { max_count=0; for (tag in count) { if (count[tag] > max_count) max_count = count[tag] } print max_count}' "${families}") + nb_genes_largest_tag0=\$(tail -n +2 "${tags}" | awk '\$3 != "-" { count[\$3] += 1} END { max_count=0; for (tag in count) { if (count[tag] > max_count) max_count = count[tag] } print max_count}') nb_duplicates=\$(cat "${families}" | wc -l) nb_genes=\$(awk '/^>/ { print \$4 }' "${proteome}" | sort | uniq | wc -l) nb_singletons=\$((nb_genes - nb_duplicates + 1)) - echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_tag}" > "${species}.csv" + echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_genes_largest_family},\${nb_tag0},\${nb_tag1},\${nb_tag_genes0},\${nb_tag_genes1},\${nb_genes_largest_tag0}" > "${species}.csv" """ } @@ -43,4 +48,4 @@ workflow { CLUSTERING(FILTER_BLASTP.out.blastp) TAGs(CLUSTERING.out, GUNZIP_2.out) CSV_ROW(params.species, GUNZIP_1.out, TAGs.out, CLUSTERING.out) -} \ No newline at end of file +}