comparative-genomics-project/workflow/scripts/protein_lengths.awk

34 lines
549 B
Awk
Raw Normal View History

2024-10-28 10:46:05 +01:00
#!/usr/bin/env -S awk -f
2024-11-04 11:37:20 +01:00
# Associate each isoform ID
# with the length of its sequence
# and the ID of its associated gene
2024-10-28 10:46:05 +01:00
# Set up the per-record state before any input is read.
#   isoform_id      - ID taken from the current header line; "" means no
#                     header has been seen yet, so nothing must be printed.
#   gene_id         - gene ID taken from the current header line.
#   sequence_length - running count of sequence characters for the record.
BEGIN {
    # `print a, b, c` joins its arguments with OFS, so output is tab-separated.
    OFS = "\t"
    isoform_id = ""
    gene_id = ""
    sequence_length = 0
}
# Header line (starts with ">"): emit the record accumulated so far, then
# start a new record from this header.
#   - isoform ID: first whitespace-delimited token without the leading ">".
#   - gene ID:    the field tagged "gene:", with the tag removed.
/^>/ {
    # Flush the previous record; isoform_id is "" until a header was seen.
    if (isoform_id != "") {
        print isoform_id, sequence_length, gene_id
    }

    # Drop a trailing carriage return (CRLF input) so it cannot end up in the
    # last field. Modifying $0 makes awk re-split the fields.
    sub(/\r$/, "")

    # Strip only the leading marker; a ">" inside the ID is part of the ID.
    isoform_id = $1
    sub(/^>/, "", isoform_id)

    # Default to the fourth field, but prefer whichever field actually starts
    # with the "gene:" tag so that a different field order still works.
    gene_id = $4
    for (field_idx = 2; field_idx <= NF; field_idx++) {
        if ($field_idx ~ /^gene:/) {
            gene_id = $field_idx
            break
        }
    }
    gsub("gene:", "", gene_id)

    sequence_length = 0
}
# Sequence line (any non-empty line that does not start with ">"): add its
# length to the running total of the current record.
/^[^>]/ {
    # Work on a copy so $0 stays untouched, and ignore trailing blanks and the
    # carriage return of CRLF files, which are not part of the sequence.
    residues = $0
    sub(/[ \t\r]+$/, "", residues)
    sequence_length += length(residues)
}
# End of input: the last record has no following header line to trigger its
# output, so emit it here. Nothing is printed if no header was ever seen
# (isoform_id is still "").
END {
    if (isoform_id != "") {
        print isoform_id, sequence_length, gene_id
    }
}