Update the families and TAGs pipeline
This commit is contained in:
parent
444933efa2
commit
ee981b5a26
|
@ -0,0 +1,12 @@
|
||||||
|
Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0
|
||||||
|
Arabidopsis lyrata,32667,26320,6348,5012,494,1571,1834,3868,4739,14
|
||||||
|
Glycine max,55897,50258,5640,8421,437,2626,2922,6692,7890,32
|
||||||
|
Gossypium raimondii,38208,32975,5234,5519,333,1964,2163,5202,6155,23
|
||||||
|
Malus domestica Golden,40624,34948,5677,6912,858,2340,2707,5643,6988,15
|
||||||
|
Musa acuminata,35012,28752,6261,4698,624,949,1069,2345,2745,21
|
||||||
|
Oryza sativa,35775,23377,12399,4605,787,1440,1835,3544,4906,19
|
||||||
|
Prunus persica,26873,20222,6652,3653,292,1758,1949,4962,5928,22
|
||||||
|
Solanum tuberosum,39021,31477,7545,4465,1044,2558,2891,6488,7903,16
|
||||||
|
Theobroma cacao,29188,21051,8138,3614,606,1593,1836,4041,5069,22
|
||||||
|
Vigna angularis,33860,26954,6907,4608,649,1622,1939,3820,4771,26
|
||||||
|
Vigna radiata,22368,17021,5348,3556,411,599,728,1332,1705,7
|
|
|
@ -1,4 +1,3 @@
|
||||||
species,proteome,gff3,blastp
|
|
||||||
Glycine max,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme
|
Glycine max,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme
|
||||||
Gossypium raimondii,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme
|
Gossypium raimondii,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme
|
||||||
Malus domestica Golden,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2
|
Malus domestica Golden,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2
|
||||||
|
@ -7,4 +6,6 @@ Oryza sativa,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics
|
||||||
Prunus persica,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme
|
Prunus persica,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme
|
||||||
Solanum tuberosum,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme
|
Solanum tuberosum,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme
|
||||||
Vigna radiata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme
|
Vigna radiata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme
|
||||||
Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme
|
Vigna angularis,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis.Vigan1.1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_angularis_Blastp_longIsoforme
|
||||||
|
Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme
|
||||||
|
Arabidopsis lyrata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata.v.1.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Arabidopsis_lyrata_Blastp_longIsoforme
|
||||||
|
|
|
|
@ -19,7 +19,7 @@ workflow {
|
||||||
GUNZIP_2(gff3)
|
GUNZIP_2(gff3)
|
||||||
FILTER_FASTA(GUNZIP_1.out)
|
FILTER_FASTA(GUNZIP_1.out)
|
||||||
// BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
|
// BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
|
||||||
// FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
|
// FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, GUNZIP_1.out, FILTER_FASTA.out.lengths)
|
||||||
|
|
||||||
// CLUSTERING(FILTER_BLASTP.out)
|
// CLUSTERING(FILTER_BLASTP.out)
|
||||||
|
|
||||||
|
|
|
@ -66,8 +66,8 @@ process FILTER_COVERAGE_IDENTITY_BLASTP {
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
awk -f "${baseDir}/scripts/filter_blastp.awk" \
|
awk -f "${baseDir}/scripts/filter_blastp.awk" \
|
||||||
-v coverage="${min_coverage}" \
|
-v min_coverage="${min_coverage}" \
|
||||||
-v identity="${min_identity}" \
|
-v min_identity="${min_identity}" \
|
||||||
"${blastp}" > 'filtered.blastp.tsv'
|
"${blastp}" > 'filtered.blastp.tsv'
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,7 +9,7 @@ process TAG_FINDER {
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
"${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0 > 'tags.tsv'
|
"${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0,1 > 'tags.tsv'
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
params {
|
||||||
|
gff3 = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz"
|
||||||
|
proteome = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz"
|
||||||
|
blastp = "/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme"
|
||||||
|
species = "Glycine_max"
|
||||||
|
results = "results"
|
||||||
|
min_coverage = 30
|
||||||
|
min_identity = 30
|
||||||
|
}
|
||||||
|
|
||||||
|
profiles {
|
||||||
|
|
||||||
|
conda {
|
||||||
|
conda.enabled = true
|
||||||
|
|
||||||
|
process {
|
||||||
|
withLabel: 'mcl' {
|
||||||
|
conda = "conda/mcl.yml"
|
||||||
|
}
|
||||||
|
withLabel: 'blast' {
|
||||||
|
conda = "conda/blast.yml"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,10 +1,12 @@
|
||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
tail -n +2 input.csv | while IFS= read -r line; do
|
while IFS= read -r line; do
|
||||||
species="$(echo $line | cut -d ',' -f 1)"
|
species="$(echo $line | cut -d ',' -f 1)"
|
||||||
proteome="$(echo $line | cut -d ',' -f 2)"
|
proteome="$(echo $line | cut -d ',' -f 2)"
|
||||||
gff3="$(echo $line | cut -d ',' -f 3)"
|
gff3="$(echo $line | cut -d ',' -f 3)"
|
||||||
blastp="$(echo $line | cut -d ',' -f 4)"
|
blastp="$(echo $line | cut -d ',' -f 4)"
|
||||||
echo Launching for $species
|
echo Launching for $species
|
||||||
nextflow run run_one.nf -resume -profile conda --species "$species" --proteome "$proteome" --gff3 "$gff3" --blastp "$blastp"
|
nextflow run run_one.nf -resume -profile conda --species "$species" --proteome "$proteome" --gff3 "$gff3" --blastp "$blastp"
|
||||||
done
|
done < "input.csv"
|
||||||
|
echo "Species,Genes,Duplicates,Singletons,Families,LargestFamily,TAG0,TAG1,TAG0 Genes,TAG1 Genes,Largest TAG0" > concat.csv
|
||||||
|
cat results/*.csv >> concat.csv
|
||||||
|
|
|
@ -20,12 +20,17 @@ process CSV_ROW {
|
||||||
path "${species}.csv"
|
path "${species}.csv"
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
nb_tag=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }')
|
nb_tag0=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }')
|
||||||
|
nb_tag1=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$4 != "-" && \$4 > max) max = \$4 } END { print max }')
|
||||||
|
nb_tag_genes0=\$(tail -n +2 "${tags}" | awk 'BEGIN {count = 0} \$3 != "-" { count += 1 } END { print count }' )
|
||||||
|
nb_tag_genes1=\$(tail -n +2 "${tags}" | awk 'BEGIN {count = 0} \$4 != "-" { count += 1 } END { print count }' )
|
||||||
nb_families=\$(tail -1 "${families}" | awk '{ print \$2 }')
|
nb_families=\$(tail -1 "${families}" | awk '{ print \$2 }')
|
||||||
|
nb_genes_largest_family=\$(awk '\$2 != "-" { count[\$2] += 1} END { max_count=0; for (tag in count) { if (count[tag] > max_count) max_count = count[tag] } print max_count}' "${families}")
|
||||||
|
nb_genes_largest_tag0=\$(tail -n +2 "${tags}" | awk '\$3 != "-" { count[\$3] += 1} END { max_count=0; for (tag in count) { if (count[tag] > max_count) max_count = count[tag] } print max_count}')
|
||||||
nb_duplicates=\$(cat "${families}" | wc -l)
|
nb_duplicates=\$(cat "${families}" | wc -l)
|
||||||
nb_genes=\$(awk '/^>/ { print \$4 }' "${proteome}" | sort | uniq | wc -l)
|
nb_genes=\$(awk '/^>/ { print \$4 }' "${proteome}" | sort | uniq | wc -l)
|
||||||
nb_singletons=\$((nb_genes - nb_duplicates + 1))
|
nb_singletons=\$((nb_genes - nb_duplicates + 1))
|
||||||
echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_tag}" > "${species}.csv"
|
echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_genes_largest_family},\${nb_tag0},\${nb_tag1},\${nb_tag_genes0},\${nb_tag_genes1},\${nb_genes_largest_tag0}" > "${species}.csv"
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,4 +48,4 @@ workflow {
|
||||||
CLUSTERING(FILTER_BLASTP.out.blastp)
|
CLUSTERING(FILTER_BLASTP.out.blastp)
|
||||||
TAGs(CLUSTERING.out, GUNZIP_2.out)
|
TAGs(CLUSTERING.out, GUNZIP_2.out)
|
||||||
CSV_ROW(params.species, GUNZIP_1.out, TAGs.out, CLUSTERING.out)
|
CSV_ROW(params.species, GUNZIP_1.out, TAGs.out, CLUSTERING.out)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue