A workflow to count genes, duplicates, singletons and TAG
This commit is contained in:
parent
3e17d75618
commit
18e9e4f56a
|
@ -0,0 +1,10 @@
|
||||||
|
species,proteome,gff3,blastp
|
||||||
|
Glycine max,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.60.chr.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max_Blastp_longIsoforme
|
||||||
|
Gossypium raimondii,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii.Graimondii2_0_v6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Gossypium_raimondii_Blastp_longIsoforme
|
||||||
|
Malus domestica Golden,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden.ASM211411v1.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Malus_domestica_golden_Blastp_longIsoforme2
|
||||||
|
Musa acuminata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Musa_acuminata.Musa_acuminata_v2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Musa_acuminata.Musa_acuminata_v2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Musa_acuminata_Blastp_longIsoforme
|
||||||
|
Oryza sativa,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa.IRGSP-1.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Oryza_sativa_Blastp_longIsoforme
|
||||||
|
Prunus persica,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica.Prunus_persica_NCBIv2.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Prunus_persica_Blastp_longIsoforme
|
||||||
|
Solanum tuberosum,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum.SolTub_3.0.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Solanum_tuberosum_Blastp_longIsoforme
|
||||||
|
Vigna radiata,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata.Vradiata_ver6.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Vigna_radiata_Blastp_longIsoforme
|
||||||
|
Theobroma cacao,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.pep.all.fa.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao.Theobroma_cacao_20110822.60.gff3.gz,/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Theobroma_cacao_Blastp_longIsoforme
|
|
|
@ -36,7 +36,7 @@ process MCL {
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
mcl "${abc}" --abc -o 'custering.mcl'
|
mcl "${abc}" --abc -o 'clustering.mcl'
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,23 @@
|
||||||
/** Filter blastp output based on coverage and identity percentage
|
/** Filter blastp output based on coverage and identity percentage
|
||||||
/**/
|
/**/
|
||||||
|
|
||||||
|
process REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA {
|
||||||
|
input:
|
||||||
|
path proteome
|
||||||
|
path blastp
|
||||||
|
output:
|
||||||
|
path 'blastp_no_supercontigs_chloroplast_nor_mitochondria.tsv'
|
||||||
|
script:
|
||||||
|
"""
|
||||||
|
awk -f "${baseDir}/scripts/black_gene_list.awk" "${proteome}" > 'blacklist.list'
|
||||||
|
if [ ! -s 'blacklist.list' ]; then
|
||||||
|
cp "${blastp}" 'blastp_no_supercontigs_chloroplast_nor_mitochondria.tsv'
|
||||||
|
else
|
||||||
|
awk -f "${baseDir}/scripts/filter_blastp_sequence_id_remove.awk" 'blacklist.list' "${blastp}" > 'blastp_no_supercontigs_chloroplast_nor_mitochondria.tsv'
|
||||||
|
fi
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
process REMOVE_LOOPS {
|
process REMOVE_LOOPS {
|
||||||
|
|
||||||
input:
|
input:
|
||||||
|
@ -12,7 +29,7 @@ process REMOVE_LOOPS {
|
||||||
script:
|
script:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
awk 'BEGIN { FS=OFS="\t" } \$1 != \$2 { print \$0 }'
|
awk 'BEGIN { FS=OFS="\t" } \$1 != \$2 { print \$0 }' "${blastp}" > 'noloops.blastp.tsv'
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,11 +44,11 @@ process ADD_GENE_ID_AND_PROTEIN_LENGTHS {
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
sort -k 1 "${blastp}" > blastp_s
|
LANG=en_EN sort -t \$'\t' -k 1 "${blastp}" > blastp_s
|
||||||
sort -k 1 "${protein_length}" > protein_length_s
|
LANG=en_EN sort -t \$'\t' -k 1 "${protein_length}" > protein_length_s
|
||||||
join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv
|
LANG=en_EN join -1 1 -2 1 -t \$'\t' blastp_s protein_length_s > join1.tsv
|
||||||
sort -k 2 join1.tsv > join1.tsv_s
|
LANG=en_EN sort -t \$'\t' -k 2 join1.tsv > join1.tsv_s
|
||||||
join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > joined.blastp.tsv
|
LANG=en_EN join -1 2 -2 1 -t \$'\t' join1.tsv_s protein_length_s > joined.blastp.tsv
|
||||||
"""
|
"""
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -39,9 +56,9 @@ process ADD_GENE_ID_AND_PROTEIN_LENGTHS {
|
||||||
process FILTER_COVERAGE_IDENTITY_BLASTP {
|
process FILTER_COVERAGE_IDENTITY_BLASTP {
|
||||||
|
|
||||||
input:
|
input:
|
||||||
|
path blastp
|
||||||
val min_coverage
|
val min_coverage
|
||||||
val min_identity
|
val min_identity
|
||||||
path blastp
|
|
||||||
|
|
||||||
output:
|
output:
|
||||||
path 'filtered.blastp.tsv'
|
path 'filtered.blastp.tsv'
|
||||||
|
@ -58,15 +75,17 @@ process FILTER_COVERAGE_IDENTITY_BLASTP {
|
||||||
workflow FILTER_BLASTP {
|
workflow FILTER_BLASTP {
|
||||||
take:
|
take:
|
||||||
blastp
|
blastp
|
||||||
|
proteome
|
||||||
protein_length
|
protein_length
|
||||||
min_coverage
|
min_coverage
|
||||||
min_identity
|
min_identity
|
||||||
|
|
||||||
main:
|
main:
|
||||||
REMOVE_LOOPS(blastp)
|
REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA(proteome, blastp)
|
||||||
ADD_GENE_ID_AND_PROTEIN_LENGTHS(REMOVE_LOOPS.out)
|
REMOVE_LOOPS(REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA.out)
|
||||||
FILTER_COVERAGE_IDENTITY_BLASTP(min_identity, min_coverage, ADD_GENE_ID_AND_PROTEIN_LENGTHS.out)
|
ADD_GENE_ID_AND_PROTEIN_LENGTHS(REMOVE_LOOPS.out, protein_length)
|
||||||
|
FILTER_COVERAGE_IDENTITY_BLASTP(ADD_GENE_ID_AND_PROTEIN_LENGTHS.out, min_identity, min_coverage)
|
||||||
|
|
||||||
emit:
|
emit:
|
||||||
blastp=FILTER_COVERAGE_IDENTITY_BLASTP
|
blastp=FILTER_COVERAGE_IDENTITY_BLASTP.out
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,16 +1,14 @@
|
||||||
/** 1. Filter out proteome sequence belonging to 'supercontig' */
|
/** 1. Filter out proteome sequence belonging to 'supercontig', Mt, or Pt */
|
||||||
/** 2. Filter proteome to keep only the longest isoform */
|
/** 2. Filter proteome to keep only the longest isoform */
|
||||||
|
|
||||||
process FILTER_SUPERCONTIG {
|
process REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA {
|
||||||
input:
|
input:
|
||||||
path proteome
|
path proteome
|
||||||
|
|
||||||
output:
|
output:
|
||||||
path 'proteome_nosupercontig.fasta'
|
path 'filtered_proteome.fasta'
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
awk -f "${baseDir}/scripts/remove_supercontigs.awk" "${proteome}" > 'proteome_nosupercontig.fasta'
|
awk -f "${baseDir}/scripts/remove_supercontigs_chloroplasts_mitochondria_fasta.awk" "${proteome}" > 'filtered_proteome.fasta'
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -42,14 +40,16 @@ process PROTEOME_LONGEST {
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
workflow FILTER_FASTA {
|
workflow FILTER_FASTA {
|
||||||
take:
|
take:
|
||||||
proteome
|
proteome
|
||||||
|
|
||||||
main:
|
main:
|
||||||
FILTER_SUPERCONTIG(proteome)
|
REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA(proteome)
|
||||||
SEQUENCE_LENGTH(FILTER_SUPERCONTIG.out)
|
SEQUENCE_LENGTH(REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA.out)
|
||||||
PROTEOME_LONGEST(FILTER_SUPERCONTIG.out, SEQUENCE_LENGTH.out)
|
PROTEOME_LONGEST(REMOVE_SUPERCONTIGS_CHLOROPLASTS_AND_MITOCHONDRIA.out, SEQUENCE_LENGTH.out)
|
||||||
|
|
||||||
emit:
|
emit:
|
||||||
lengths = SEQUENCE_LENGTH.out
|
lengths = SEQUENCE_LENGTH.out
|
||||||
|
|
|
@ -9,7 +9,7 @@ process TAG_FINDER {
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
"./${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" > 'tags.tsv'
|
"${baseDir}/../../rust/tagfinder/target/release/tagfinder" --positions "${positions}" --families "${families}" --definitions 0 > 'tags.tsv'
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -32,8 +32,8 @@ process GENE_POSITION_TABLE {
|
||||||
workflow TAGs {
|
workflow TAGs {
|
||||||
|
|
||||||
take:
|
take:
|
||||||
gff3
|
|
||||||
families
|
families
|
||||||
|
gff3
|
||||||
|
|
||||||
main:
|
main:
|
||||||
GENE_POSITION_TABLE(gff3)
|
GENE_POSITION_TABLE(gff3)
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
#!/usr/bin/env bash
|
||||||
|
set -euo pipefail
|
||||||
|
tail -n +2 input.csv | while IFS= read -r line; do
|
||||||
|
species="$(echo $line | cut -d ',' -f 1)"
|
||||||
|
proteome="$(echo $line | cut -d ',' -f 2)"
|
||||||
|
gff3="$(echo $line | cut -d ',' -f 3)"
|
||||||
|
blastp="$(echo $line | cut -d ',' -f 4)"
|
||||||
|
echo Launching for $species
|
||||||
|
nextflow run run_one.nf -resume -profile conda --species "$species" --proteome "$proteome" --gff3 "$gff3" --blastp "$blastp"
|
||||||
|
done
|
|
@ -0,0 +1,46 @@
|
||||||
|
/** Run the workflow for all species in the csv table */
|
||||||
|
|
||||||
|
include { GUNZIP as GUNZIP_1 } from "./modules/gunzip.nf"
|
||||||
|
include { GUNZIP as GUNZIP_2 } from "./modules/gunzip.nf"
|
||||||
|
include { FILTER_FASTA } from "./modules/filter_fasta.nf"
|
||||||
|
include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
|
||||||
|
include { CLUSTERING } from "./modules/clustering.nf"
|
||||||
|
include { TAGs } from "./modules/tags.nf"
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
process CSV_ROW {
|
||||||
|
publishDir "results/", mode: 'copy'
|
||||||
|
input:
|
||||||
|
val(species)
|
||||||
|
path(proteome)
|
||||||
|
path(tags)
|
||||||
|
path(families)
|
||||||
|
output:
|
||||||
|
path "${species}.csv"
|
||||||
|
script:
|
||||||
|
"""
|
||||||
|
nb_tag=\$(tail -n +2 "${tags}"| awk 'BEGIN { max = 0 } { if (\$3 != "-" && \$3 > max) max = \$3 } END { print max }')
|
||||||
|
nb_families=\$(tail -1 "${families}" | awk '{ print \$2 }')
|
||||||
|
nb_duplicates=\$(cat "${families}" | wc -l)
|
||||||
|
nb_genes=\$(awk '/^>/ { print \$4 }' "${proteome}" | sort | uniq | wc -l)
|
||||||
|
nb_singletons=\$((nb_genes - nb_duplicates + 1))
|
||||||
|
echo "${species},\${nb_genes},\${nb_duplicates},\${nb_singletons},\${nb_families},\${nb_tag}" > "${species}.csv"
|
||||||
|
"""
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
workflow {
|
||||||
|
proteome = file(params.proteome)
|
||||||
|
gff3 = file(params.gff3)
|
||||||
|
blastp = file(params.blastp)
|
||||||
|
|
||||||
|
GUNZIP_1(proteome)
|
||||||
|
GUNZIP_2(gff3)
|
||||||
|
FILTER_FASTA(GUNZIP_1.out)
|
||||||
|
FILTER_BLASTP(blastp, GUNZIP_1.out, FILTER_FASTA.out.lengths, params.min_coverage, params.min_identity)
|
||||||
|
CLUSTERING(FILTER_BLASTP.out.blastp)
|
||||||
|
TAGs(CLUSTERING.out, GUNZIP_2.out)
|
||||||
|
CSV_ROW(params.species, GUNZIP_1.out, TAGs.out, CLUSTERING.out)
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
#!/usr/bin/env -S awk -f
|
||||||
|
|
||||||
|
/^>/ {
|
||||||
|
locus=$3
|
||||||
|
split(":", arr, locus)
|
||||||
|
chromosome=arr[3]
|
||||||
|
if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /supercontig/) {
|
||||||
|
record_id=$1
|
||||||
|
gsub(">", "", record_id)
|
||||||
|
print record_id
|
||||||
|
}
|
||||||
|
}
|
|
@ -9,13 +9,13 @@
|
||||||
|
|
||||||
NR == FNR {
|
NR == FNR {
|
||||||
sequence_id = $1
|
sequence_id = $1
|
||||||
to_remove[sequence_id] = 1
|
kept[sequence_id] = 1
|
||||||
next
|
next
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
sequence_id = $1
|
sequence_id = $1
|
||||||
if (!(sequence_id in to_remove)) {
|
if (sequence_id in kept || $2 in kept) {
|
||||||
print $0
|
print $0
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,25 @@
|
||||||
|
#!/usr/bin/env -S awk -f
|
||||||
|
# Filter BLASTp format 6 file
|
||||||
|
# to keep only the records
|
||||||
|
# with no ID in records.list
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# awk -f filter_blastp_sequence_id.awk \
|
||||||
|
# records.list records.blastp.tsv
|
||||||
|
|
||||||
|
BEGIN {
|
||||||
|
FS="\t"
|
||||||
|
OFS="\t"
|
||||||
|
}
|
||||||
|
|
||||||
|
NR==FNR {
|
||||||
|
sequence_id=$1
|
||||||
|
kept[sequence_id]=1
|
||||||
|
next
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
if (!($1 in kept) && !($2 in kept)) {
|
||||||
|
print $0
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,10 +1,10 @@
|
||||||
#!/usr/bin/env -S awk -f
|
#!/usr/bin/env -S awk -f
|
||||||
# Filter FASTA file
|
# Filter FASTA file
|
||||||
# to keep only the records
|
# to keep or remove only the records
|
||||||
# with ID in records.list
|
# with ID in records.list
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# awk -f filter_record_fasta.awk \
|
# awk -f (-v direction=keep|remove) filter_record_fasta.awk \
|
||||||
# records.list records.fasta
|
# records.list records.fasta
|
||||||
|
|
||||||
BEGIN {
|
BEGIN {
|
||||||
|
@ -22,9 +22,16 @@ NR == FNR {
|
||||||
sequence_id = $1
|
sequence_id = $1
|
||||||
gsub(">", "", sequence_id)
|
gsub(">", "", sequence_id)
|
||||||
if (sequence_id in kept) {
|
if (sequence_id in kept) {
|
||||||
|
if (direction == "keep") {
|
||||||
print $0
|
print $0
|
||||||
on_good_record = 1
|
on_good_record = 1
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
if (direction == "remove") {
|
||||||
|
print $0
|
||||||
|
on_good_record = 1
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/^[^>]/ && on_good_record {
|
/^[^>]/ && on_good_record {
|
||||||
|
|
|
@ -1,25 +0,0 @@
|
||||||
#!/usr/bin/env -S awk -f
|
|
||||||
# Filter out 'supercontigs' records
|
|
||||||
# from an Ensembl proteome fasta file
|
|
||||||
# Usage:
|
|
||||||
# awk -f remove_supercontigs.awk \
|
|
||||||
# "proteome.fasta" \
|
|
||||||
# > "proteome_nosupercontigs.fasta"
|
|
||||||
|
|
||||||
BEGIN {
|
|
||||||
on_good_sequence = 0
|
|
||||||
}
|
|
||||||
|
|
||||||
/^>/ {
|
|
||||||
on_good_sequence = 0
|
|
||||||
locus = $3
|
|
||||||
split(locus, locus_array, ":")
|
|
||||||
if (locus_array[1] != "supercontig") {
|
|
||||||
on_good_sequence = 1
|
|
||||||
print $0
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/^[^>]/ && on_good_sequence {
|
|
||||||
print $0
|
|
||||||
}
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
#!/usr/bin/env -S awk -f
|
||||||
|
|
||||||
|
BEGIN {
|
||||||
|
on_good_sequence = 0
|
||||||
|
}
|
||||||
|
|
||||||
|
/^>/ {
|
||||||
|
locus=$3
|
||||||
|
split(locus, arr, ":")
|
||||||
|
chromosome=arr[3]
|
||||||
|
if (chromosome == "Mt" || chromosome == "Pt" || chromosome ~ /supercontig/) {
|
||||||
|
on_good_sequence = 0
|
||||||
|
} else {
|
||||||
|
on_good_sequence = 1
|
||||||
|
print $0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/^[^>]/ && on_good_sequence == 1 {
|
||||||
|
print $0
|
||||||
|
}
|
Loading…
Reference in New Issue