Update the families and TAGs pipeline

This commit is contained in:
Samuel Ortion 2025-01-08 13:15:15 +01:00
parent 444933efa2
commit ee981b5a26
Signed by: sortion
GPG Key ID: 9B02406F8C4FB765
8 changed files with 56 additions and 11 deletions

View File

@ -0,0 +1,12 @@
Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0
Arabidopsis lyrata,32667,26320,6348,5012,494,1571,1834,3868,4739,14
Glycine max,55897,50258,5640,8421,437,2626,2922,6692,7890,32
Gossypium raimondii,38208,32975,5234,5519,333,1964,2163,5202,6155,23
Malus domestica Golden,40624,34948,5677,6912,858,2340,2707,5643,6988,15
Musa acuminata,35012,28752,6261,4698,624,949,1069,2345,2745,21
Oryza sativa,35775,23377,12399,4605,787,1440,1835,3544,4906,19
Prunus persica,26873,20222,6652,3653,292,1758,1949,4962,5928,22
Solanum tuberosum,39021,31477,7545,4465,1044,2558,2891,6488,7903,16
Theobroma cacao,29188,21051,8138,3614,606,1593,1836,4041,5069,22
Vigna angularis,33860,26954,6907,4608,649,1622,1939,3820,4771,26
Vigna radiata,22368,17021,5348,3556,411,599,728,1332,1705,7
1 Species Genes Duplicates Singletons Families LargestFamily TAG0 TAG1 TAG0 Genes TAG1 Genes Largest TAG0
2 Arabidopsis lyrata 32667 26320 6348 5012 494 1571 1834 3868 4739 14
3 Glycine max 55897 50258 5640 8421 437 2626 2922 6692 7890 32
4 Gossypium raimondii 38208 32975 5234 5519 333 1964 2163 5202 6155 23
5 Malus domestica Golden 40624 34948 5677 6912 858 2340 2707 5643 6988 15
6 Musa acuminata 35012 28752 6261 4698 624 949 1069 2345 2745 21
7 Oryza sativa 35775 23377 12399 4605 787 1440 1835 3544 4906 19
8 Prunus persica 26873 20222 6652 3653 292 1758 1949 4962 5928 22
9 Solanum tuberosum 39021 31477 7545 4465 1044 2558 2891 6488 7903 16
10 Theobroma cacao 29188 21051 8138 3614 606 1593 1836 4041 5069 22
11 Vigna angularis 33860 26954 6907 4608 649 1622 1939 3820 4771 26
12 Vigna radiata 22368 17021 5348 3556 411 599 728 1332 1705 7

View File

@ -1,4 +1,3 @@
species,proteome,gff3,blastp
Glycine max,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme Glycine max,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme
Gossypium raimondii,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme Gossypium raimondii,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme
Malus domestica Golden,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2 Malus domestica Golden,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2
@ -7,4 +6,6 @@ Oryza sativa,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics
Prunus persica,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme Prunus persica,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme
Solanum tuberosum,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme Solanum tuberosum,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme
Vigna radiata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme Vigna radiata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme
Vigna angularis,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis_Blastp_longIsoforme
Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme
Arabidopsis lyrata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata_Blastp_longIsoforme

1 species Glycine max proteome /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz gff3 /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz blastp /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme
species proteome gff3 blastp
1 Glycine max Glycine max /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme
2 Gossypium raimondii Gossypium raimondii /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme
3 Malus domestica Golden Malus domestica Golden /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2 /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2
6 Prunus persica Prunus persica /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme
7 Solanum tuberosum Solanum tuberosum /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme
8 Vigna radiata Vigna radiata /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme
9 Vigna angularis /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis_Blastp_longIsoforme
10 Theobroma cacao Theobroma cacao /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme
11 Arabidopsis lyrata /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.pep.all.fa.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.60.gff3.gz /media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata_Blastp_longIsoforme

View File

@ -19,7 +19,7 @@ workflow {
GUNZIP_2(gff3) GUNZIP_2(gff3)
FILTER_FASTA(GUNZIP_1.out) FILTER_FASTA(GUNZIP_1.out)
// BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome) // BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
// FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths) // FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, GUNZIP_1.out, FILTER_FASTA.out.lengths)
// CLUSTERING(FILTER_BLASTP.out) // CLUSTERING(FILTER_BLASTP.out)

View File

@ -66,8 +66,8 @@ process FILTER_COVERAGE_IDENTITY_BLASTP {
script: script:
""" """
awk -f "${baseDir}/scripts/filter_blastp.awk" \ awk -f "${baseDir}/scripts/filter_blastp.awk" \
-v coverage="${min_coverage}" \ -v min_coverage="${min_coverage}" \
-v identity="${min_identity}" \ -v min_identity="${min_identity}" \
"${blastp}" > 'filtered.blastp.tsv' "${blastp}" > 'filtered.blastp.tsv'
""" """
} }

View File

@ -9,7 +9,7 @@ process TAG_FINDER {
script: script:
""" """
"${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0 > 'tags.tsv' "${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0,1 > 'tags.tsv'
""" """
} }

View File

@ -0,0 +1,25 @@
// Default pipeline parameters, written as dot-prefixed assignments.
// Any of them can be overridden at launch with `--<name> <value>`
// (e.g. `--species`, `--proteome`, `--gff3`, `--blastp`).

// Input files; the defaults point at the Glycine max dataset.
params.gff3 = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz"
params.proteome = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
params.blastp = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme"

// Species name for the run.
params.species = "Glycine_max"

// NOTE(review): presumably the directory where outputs are published — confirm against the processes.
params.results = "results"

// Thresholds for filtering BLASTp hits.
// NOTE(review): presumably percentages — confirm against scripts/filter_blastp.awk.
params.min_coverage = 30
params.min_identity = 30
// Execution profiles; a profile is selected at launch with `-profile <name>`.
profiles {
// `conda` profile: enables Conda and assigns an environment file per process label.
conda {
conda.enabled = true
process {
// Processes labelled 'mcl' use the environment described in conda/mcl.yml.
// NOTE(review): the environment paths below are relative — confirm they resolve
// from the directory the pipeline is launched in.
withLabel: 'mcl' {
conda = "conda/mcl.yml"
}
// Processes labelled 'blast' use the environment described in conda/blast.yml.
withLabel: 'blast' {
conda = "conda/blast.yml"
}
}
}
}

View File

@ -1,10 +1,12 @@
#!/usr/bin/env bash
# Launch the single-species pipeline (run_one.nf) once per row of input.csv,
# then merge the per-species CSV rows found under results/ into concat.csv.
#
# input.csv columns: species,proteome,gff3,blastp
set -euo pipefail

# Rows are read on file descriptor 3 so that the launched command keeps the
# caller's stdin and can never consume rows that have not been processed yet.
# Splitting with IFS=, replaces the unquoted `echo $line | cut` round-trip,
# which word-split and glob-expanded every row; `_extra` absorbs any field past
# the fourth so `blastp` holds exactly field 4, as `cut -f 4` returned.
# The `|| [[ -n ... ]]` clause keeps a final row that lacks a trailing newline.
while IFS=, read -r species proteome gff3 blastp _extra <&3 || [[ -n "${species}" ]]; do
    # Skip blank lines and tolerate an optional header row.
    if [[ -z "${species}" || "${species}" == "species" ]]; then
        continue
    fi
    echo "Launching for ${species}"
    nextflow run run_one.nf -resume -profile conda --species "$species" --proteome "$proteome" --gff3 "$gff3" --blastp "$blastp"
done 3< "input.csv"

# Write the header first, then append every per-species row.
echo "Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0" > concat.csv
cat results/*.csv >> concat.csv

View File

@ -20,12 +20,17 @@ process CSV_ROW {
path "${species}.csv" path "${species}.csv"
script: script:
""" """
nb_tag=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }') nb_tag0=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }')
nb_tag1=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$4 != "-" && \$4 > max) max = \$4 } END { print max }')
nb_tag_genes0=\$(tail -n +2 "${tags}" | awk 'BEGIN {count = 0} \$3 != "-" { count += 1 } END { print count }' )
nb_tag_genes1=\$(tail -n +2 "${tags}" | awk 'BEGIN {count = 0} \$4 != "-" { count += 1 } END { print count }' )
nb_families=\$(tail -1 "${families}" | awk '{ print \$2 }') nb_families=\$(tail -1 "${families}" | awk '{ print \$2 }')
nb_genes_largest_family=\$(awk '\$2 != "-" { count[\$2] += 1} END { max_count=0; for (tag in count) { if (count[tag] > max_count) max_count = count[tag] } print max_count}' "${families}")
nb_genes_largest_tag0=\$(tail -n +2 "${tags}" | awk '\$3 != "-" { count[\$3] += 1} END { max_count=0; for (tag in count) { if (count[tag] > max_count) max_count = count[tag] } print max_count}')
nb_duplicates=\$(cat "${families}" | wc -l) nb_duplicates=\$(cat "${families}" | wc -l)
nb_genes=\$(awk '/^>/ { print \$4 }' "${proteome}" | sort | uniq | wc -l) nb_genes=\$(awk '/^>/ { print \$4 }' "${proteome}" | sort | uniq | wc -l)
nb_singletons=\$((nb_genes - nb_duplicates + 1)) nb_singletons=\$((nb_genes - nb_duplicates + 1))
echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_tag}" > "${species}.csv" echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_genes_largest_family},\${nb_tag0},\${nb_tag1},\${nb_tag_genes0},\${nb_tag_genes1},\${nb_genes_largest_tag0}" > "${species}.csv"
""" """
} }