Update workflow
This commit is contained in:
parent
0540e3699e
commit
53f6a60943
|
@ -8,18 +8,26 @@
|
||||||
nextflow.enable.dsl = 2;
|
nextflow.enable.dsl = 2;
|
||||||
|
|
||||||
include { GUNZIP } from "./modules/gunzip.nf"
|
include { GUNZIP } from "./modules/gunzip.nf"
|
||||||
include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
|
include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
|
||||||
include { BLAST_BLASTP } from "./modules/blast.nf"
|
|
||||||
include { FILTER_FASTA } from "./modules/filter_fasta.nf"
|
include { FILTER_FASTA } from "./modules/filter_fasta.nf"
|
||||||
include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
|
include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
|
||||||
include { CLUSTERING } from "./modules/clustering.nf"
|
include { CLUSTERING } from "./modules/clustering.nf"
|
||||||
|
|
||||||
|
process PROTEIN_GENE_MAPPING {
|
||||||
|
|
||||||
|
input:
|
||||||
|
path proteome
|
||||||
|
|
||||||
|
output:
|
||||||
|
path 'protein_gene.tsv'
|
||||||
|
}
|
||||||
|
|
||||||
workflow {
|
workflow {
|
||||||
proteome = Channel.fromPath(params.proteome)
|
proteome = Channel.fromPath(params.proteome)
|
||||||
GUNZIP(proteome)
|
GUNZIP(proteome)
|
||||||
FILTER_FASTA(GUNZIP.out)
|
FILTER_FASTA(GUNZIP.out)
|
||||||
BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
|
BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
|
||||||
BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
|
FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
|
||||||
FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
|
|
||||||
CLUSTERING(FILTER_BLASTP.out)
|
CLUSTERING(FILTER_BLASTP.out)
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,6 +34,19 @@ process BLAST_BLASTP {
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
blastp -query "${proteome}" -db 'db' -outfmt '6' -out "${species}.all-against-all.blastp.tsv"
|
blastp -query "${proteome}" -db 'db' -outfmt '6' -num_threads 7 -out "${species}.all-against-all.blastp.tsv"
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
workflow BLAST_ALL_AGAINST_ALL {
|
||||||
|
|
||||||
|
take:
|
||||||
|
proteome
|
||||||
|
|
||||||
|
main:
|
||||||
|
BLAST_MAKEBLASTDB(params.species, proteome)
|
||||||
|
BLAST_BLASTP(params.species, proteome, BLAST_MAKEBLASTDB.out)
|
||||||
|
|
||||||
|
emit:
|
||||||
|
BLAST_BLASTP.out
|
||||||
|
}
|
||||||
|
|
|
@ -46,7 +46,7 @@ workflow CLUSTERING {
|
||||||
|
|
||||||
main:
|
main:
|
||||||
BLASTP_TO_ABC(blastp_tsv)
|
BLASTP_TO_ABC(blastp_tsv)
|
||||||
MCL(BLASTP_TO_ABC).out
|
MCL(BLASTP_TO_ABC.out)
|
||||||
MCL_TO_TSV(MCL.out)
|
MCL_TO_TSV(MCL.out)
|
||||||
|
|
||||||
emit:
|
emit:
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
/** Filter blastp output based on coverage and identity percentage
|
/** Filter blastp output based on coverage and identity percentage
|
||||||
/**/
|
/**/
|
||||||
|
|
||||||
|
|
||||||
process FILTER_BLASTP {
|
process FILTER_BLASTP {
|
||||||
|
|
||||||
input:
|
input:
|
||||||
|
@ -8,18 +9,21 @@ process FILTER_BLASTP {
|
||||||
val min_identity
|
val min_identity
|
||||||
path blastp
|
path blastp
|
||||||
path protein_length
|
path protein_length
|
||||||
|
path protein_gene
|
||||||
|
|
||||||
output:
|
output:
|
||||||
path 'filtered_blastp.tsv'
|
path 'filtered_blastp.tsv'
|
||||||
|
|
||||||
script:
|
script:
|
||||||
"""
|
"""
|
||||||
sort "${blastp}" > blastp_s
|
sort -k 1 "${blastp}" > blastp_s
|
||||||
sort "${protein_length}" > protein_length_s
|
sort -k 1 "${protein_length}" > protein_length_s
|
||||||
join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
|
join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv
|
||||||
sort -k 2 "join1.tsv" > join1.tsv_s
|
sort -k 2 join1.tsv > join1.tsv_s
|
||||||
join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
|
# Second join: match field 2 of join1.tsv_s against field 1 of protein_length_s.
join -1 2 -2 1 -t'\t' 'join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv'
|
||||||
awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
|
awk -f "${baseDir}/scripts/filter_blastp.awk" \
|
||||||
|
-v coverage="${min_coverage}" \
|
||||||
|
-v identity="${min_identity}" \
|
||||||
"${blastp}" > 'filtered_blastp.tsv'
|
"${blastp}" > 'filtered_blastp.tsv'
|
||||||
"""
|
"""
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
#!/usr/bin/env -S awk -f
|
||||||
|
# Filter BLASTp format 6 file
|
||||||
|
# to discard the records
|
||||||
|
# with ID in records.list
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# awk -f filter_blastp_sequence_id.awk \
|
||||||
|
# records.list records.blastp.tsv
|
||||||
|
|
||||||
|
NR == FNR {
|
||||||
|
sequence_id = $1
|
||||||
|
to_remove[sequence_id] = 1
|
||||||
|
next
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
sequence_id = $1
|
||||||
|
if (!(sequence_id in to_remove)) {
|
||||||
|
print $0
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,30 @@
|
||||||
|
#!/usr/bin/env -S awk -f
|
||||||
|
# Usage: $0 file.abc > filtered_file.abc
|
||||||
|
BEGIN {
|
||||||
|
OFS = FS = "\t"
|
||||||
|
}
|
||||||
|
{
|
||||||
|
a=$1
|
||||||
|
b=$2
|
||||||
|
c=$3
|
||||||
|
if (a > b) {
|
||||||
|
tmp=a
|
||||||
|
a=b
|
||||||
|
b=tmp
|
||||||
|
}
|
||||||
|
if (!(b in graph[a])) {
|
||||||
|
graph[a][b] = c
|
||||||
|
} else {
|
||||||
|
if (graph[a][b] < c) {
|
||||||
|
graph[a][b] = c
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
END {
|
||||||
|
for (a in graph) {
|
||||||
|
for (b in graph[a]) {
|
||||||
|
print a, b, graph[a][b]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,12 +5,12 @@
|
||||||
BEGIN {
|
BEGIN {
|
||||||
family_identifier=0
|
family_identifier=0
|
||||||
OFS="\t"
|
OFS="\t"
|
||||||
|
FS=" "
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
family_identifier++
|
family_identifier++
|
||||||
split($0, gene_list, " ")
|
for (i=1; i <= NF; i++) {
|
||||||
for (gene in gene_list) {
|
print $i, family_identifier
|
||||||
print gene, family_identifier
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,19 +1,25 @@
|
||||||
#!/usr/bin/env -S awk -f
|
#!/usr/bin/env -S awk -f
|
||||||
# Associate an isoform id to the length of the sequence
|
# Associate an isoform id
|
||||||
|
# to the length of the sequence
|
||||||
|
# and the associated gene id
|
||||||
|
|
||||||
BEGIN {
|
BEGIN {
|
||||||
sequence_length=0
|
sequence_length=0
|
||||||
isoform_id=""
|
isoform_id=""
|
||||||
|
gene_id=""
|
||||||
|
|
||||||
|
OFS="\t"
|
||||||
}
|
}
|
||||||
|
|
||||||
/^>/ {
|
/^>/ {
|
||||||
if (isoform_id != "") {
|
if (isoform_id != "") {
|
||||||
print isoform_id, sequence_length
|
print isoform_id, sequence_length, gene_id
|
||||||
} else {
|
|
||||||
isoform_id = $1
|
|
||||||
gsub(">", "", isoform_id)
|
|
||||||
sequence_length = 0
|
|
||||||
}
|
}
|
||||||
|
isoform_id = $1
|
||||||
|
gene_id = $4
|
||||||
|
gsub(">", "", isoform_id)
|
||||||
|
gsub("gene:", "", gene_id)
|
||||||
|
sequence_length = 0
|
||||||
}
|
}
|
||||||
|
|
||||||
/^[^>]/ {
|
/^[^>]/ {
|
||||||
|
@ -22,6 +28,6 @@ BEGIN {
|
||||||
|
|
||||||
END {
|
END {
|
||||||
if (isoform_id != "") {
|
if (isoform_id != "") {
|
||||||
print isoform_id, sequence_length
|
print isoform_id, sequence_length, gene_id
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue