comparative-genomics-project/workflow/scripts/protein_lengths.awk

34 lines
549 B
Awk

#!/usr/bin/env -S awk -f
# Build a tab-separated table from a protein FASTA file, one row per
# record:
#
#   isoform_id <TAB> sequence_length <TAB> gene_id
#
#   isoform_id       first header field, without the leading ">"
#   sequence_length  total number of characters on the record's sequence
#                    lines; spaces, tabs and carriage returns are not counted
#   gene_id          fourth header field, without its leading "gene:" tag
#
# NOTE(review): taking the gene id from field 4 presumably matches
# Ensembl-style peptide headers (">ID pep location gene:ID ...") - confirm
# against the FASTA files this workflow feeds in.

# Print the record collected so far. Does nothing before the first header,
# or after a header whose id is empty.
function emit_record() {
    if (isoform_id != "") {
        print isoform_id, sequence_length, gene_id
    }
}

BEGIN {
    OFS = "\t"
    isoform_id = ""
    gene_id = ""
    sequence_length = 0
}

# Header line: flush the previous record, then start a new one.
/^>/ {
    emit_record()

    # Drop a trailing CR so CRLF files do not leak "\r" into the last field.
    # Assigning to $0 makes awk re-split the fields.
    sub(/\r$/, "")

    isoform_id = $1
    gene_id = $4
    # Anchored sub() removes only the leading marker/tag, never a matching
    # substring further inside the identifier.
    sub(/^>/, "", isoform_id)
    sub(/^gene:/, "", gene_id)

    sequence_length = 0
    next
}

# Any other non-empty line is sequence data belonging to the current record.
/^[^>]/ {
    residues = $0
    # Explicit character list instead of [[:space:]] for portability to
    # awk implementations without POSIX character classes.
    gsub(/[ \t\r]/, "", residues)
    sequence_length += length(residues)
}

END {
    emit_record()
}