feat: Exercise considered done

This commit is contained in:
Samuel Ortion 2023-10-01 12:26:59 +02:00
commit b5cdf0a828
14 changed files with 499 additions and 0 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
# Store everything under data/ in Git LFS. Attribute patterns that match a
# directory do not apply to the files inside it, so `data/**` is required
# rather than a bare `data`.
data/** filter=lfs diff=lfs merge=lfs -text

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
# Build tree; the Makefile configures CMake with `-B build`.
build
# NOTE(review): presumably the directory CTest writes its results to - confirm.
Testing

39
CMakeLists.txt Normal file
View File

@ -0,0 +1,39 @@
# FetchContent_MakeAvailable() needs CMake 3.14 and gtest_discover_tests()
# needs 3.10, so 3.14 is the lowest version this script can actually run on.
cmake_minimum_required(VERSION 3.14)
project(dnapp)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g3 -ggdb")

# Command-line application.
set(SRCS src/main.cpp)
add_executable(${PROJECT_NAME} ${SRCS})

# Treat warnings as errors for the application target.
if(MSVC)
  target_compile_options(${PROJECT_NAME} PRIVATE /W4 /WX)
else()
  target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -Wpedantic -Werror)
endif()

# Google Test, pinned to a fixed commit archive.
include(FetchContent)
FetchContent_Declare(
  googletest
  URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

# Unit tests, registered with CTest through test discovery.
enable_testing()
add_executable(
  Sequence_test
  tests/Sequence_test.cpp
)
target_link_libraries(
  Sequence_test
  GTest::gtest_main
)
include(GoogleTest)
gtest_discover_tests(Sequence_test)

22
Makefile Normal file
View File

@ -0,0 +1,22 @@
# Convenience wrapper around CMake / CTest / Conan.
# None of the targets below produces a file with the target's name, and a
# directory called `build` exists after the first run, so all of them are
# declared phony at the bottom of the file.

# Default goal: configure + compile, then run on the sample data.
all: build run

# Configure and compile a Release build into ./build.
build:
	cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
	cmake --build build

# Resolve dependencies with Conan into ./build.
conan-install:
	conan install . --output-folder=build --build=missing

# Build using the toolchain file generated by `conan-install`.
conan-build: conan-install
	cmake -S . -B build/ -DCMAKE_TOOLCHAIN_FILE=conan_toolchain.cmake -DCMAKE_BUILD_TYPE=Release
	cmake --build build

# Run the application on the bundled FASTA sample.
run:
	./build/dnapp -i ./data/sample.fa

# Configure, compile and run the unit tests through CTest.
test:
	cmake -S . -B build
	cmake --build build
	cd build && ctest --output-on-failure

.PHONY: all build conan-install conan-build run test

24
README.md Normal file
View File

@ -0,0 +1,24 @@
# DNA++
## Instructions
1. Create a base class Sequence that represents a generic sequence. Include the following attributes and methods:
1. Attributes:
1. sequence: A string representing the sequence itself.
2. Methods:
1. `__init__(self, sequence)`: Initialize the sequence.
2. `get_length(self)`: Return the length of the sequence.
3. `get_sequence(self)`: Return the sequence as a string.
2. Create a subclass DNASequence that inherits from the Sequence class. Add the following methods:
1. `get_complement(self)`: Return the complementary DNA sequence. A pairs with T, and C pairs with G.
2. `transcribe_to_rna(self)`: Return the RNA sequence by replacing all T's with U's.
3. Create another subclass ProteinSequence that inherits from the Sequence class. Add a method `translate_to_protein(self)` that translates the DNA sequence into a protein
sequence using the genetic code (you can use a dictionary to map codons to amino acids).
4. Provide an example DNA sequence and demonstrate the functionality of your program by:
1. Creating a DNA sequence object.
2. Getting its complement and transcribed RNA sequence.
3. Translating it into a protein sequence.
Hints:
* You can create a dictionary to represent the genetic code, where keys are codons (triplets of nucleotides), and values are the corresponding amino acids.
* The complement of a DNA sequence can be found by replacing A with T, T with A, C with G, and G with C.

2
data/sample.fa Normal file
View File

@ -0,0 +1,2 @@
>BBa_J45004 SAM:benzoic acid/salicylic acid carboxyl methyltransferase I; converts salicylic acid to methyl sali
atggaagttgttgaagttcttcacatgaatggaggaaatggagacagtagctatgcaaacaattctttggttcagcaaaaggtgattctcatgacaaagccaataactgagcaagccatgattgatctctacagcagcctctttccagaaaccttatgcattgcagatttgggttgttctttgggagctaacactttcttggtggtctcacagcttgttaaaatagtagaaaaagaacgaaaaaagcatggttttaagtctccagagttttattttcacttcaatgatcttcctggcaatgattttaatacactttttcagtcactgggggcatttcaagaagatttgagaaagcatataggggaaagctttggtccatgttttttcagtggagtgcctggttcattttatactagacttttcccttccaaaagtttacattttgtttactcctcctacagtctcatgtggctatctcaggtgcctaatgggattgaaaataacaagggaaacatttacatggcaagaacaagccctctaagtgttattaaagcatactacaagcaatatgaaatagatttttcaaattttctcaagtaccgttcagaggaattgatgaaaggtggaaagatggtgttaacactcctaggtagagaaagtgaggatcctactagcaaagaatgctgttacatttgggagcttctagccatggccctcaataagttggttgaagagggattgataaaagaagagaaagtagatgcattcaatattcctcaatacacaccatcaccagcagaagtaaagtacatagttgagaaggaaggatcattcaccattaatcgcttggaaacatcaagagttcattggaatgcttctaataatgagaagaatggtggttacaatgtgtcaaggtgcatgagagctgtggctgagcctttgcttgtcagccactttgacaaggaattgatggatttagtgttccacaagtacgaagagattgtttctgattgcatgtccaaagagaatactgagtttataaatgtcatcatctccttgaccaaaataaattaa

98
include/GeneticCode.hpp Normal file
View File

@ -0,0 +1,98 @@
/** Implement the genetic code as a thread-safe Singleton.
 *
 * @see https://refactoring.guru/design-patterns/singleton/cpp/example#example-1
 */
#pragma once
#include <algorithm>
#include <complex>
#include <fstream>
#include <sstream>
#include <iostream>
#include <stdexcept>
#include <unordered_map>
#include <vector>
#include <mutex>
#include "Sequence.hpp"
#include "strutils.hpp"
/**
 * Codon -> one-letter amino-acid lookup table.
 *
 * Instances are only reachable through the static GetInstance() functions:
 * the constructors are protected and the type is neither copyable nor
 * assignable. The table is filled from a tab-separated text file in which
 * the first column is the codon and the first character of the third column
 * is the amino-acid letter.
 */
class GeneticCode {
private:
    static GeneticCode *pinstance_;
    static std::mutex mutex_;

protected:
    /// Codon (as written in the table file) -> amino-acid letter.
    std::unordered_map<std::string, char> codon_table;

    /// Load the table from the default location, relative to the working directory.
    GeneticCode() : GeneticCode("./resources/codons.txt") {}

    /**
     * Load the table from `input_filename`.
     *
     * @throws std::invalid_argument if the file cannot be opened for reading.
     */
    GeneticCode(std::string input_filename) {
        std::ifstream input(input_filename);
        if (!input.good()) {
            throw std::invalid_argument("Error: could not read codon table from " + input_filename + " file.");
        }
        load_genetic_code(input);
    }

    /**
     * Parse every line of `input` into `codon_table`.
     *
     * Blank lines (including the one produced by a trailing newline at the
     * end of the file) are skipped; a carriage return left over from CRLF
     * line endings is stripped before parsing.
     *
     * @throws std::invalid_argument if the stream is not readable.
     * @throws std::out_of_range if a non-blank line has fewer than three fields.
     */
    void load_genetic_code(std::ifstream &input) {
        if (!input.good()) {
            throw std::invalid_argument("Error: Could not read the codon table from the text file.");
        }
        std::string line;
        // Looping on getline() itself stops as soon as no line could be read,
        // so a failed read is never parsed as an (empty) table row.
        while (std::getline(input, line)) {
            if (!line.empty() && line.back() == '\r') {
                line.pop_back();
            }
            if (line.empty()) {
                continue;
            }
            std::vector<std::string> fields = split_string(line, '\t');
            const std::string &codon = fields.at(0);
            const char amino_acid = fields.at(2)[0];
            codon_table.emplace(codon, amino_acid);
        }
    }

public:
    GeneticCode(const GeneticCode &) = delete;
    void operator=(const GeneticCode &) = delete;

    static GeneticCode *GetInstance();
    static GeneticCode *GetInstance(std::string input_filename);

    /**
     * Translate one codon into its amino-acid letter.
     *
     * Every 'U' in `codon` is replaced by 'T' before the lookup, so codons
     * may be given in either the RNA or the DNA alphabet.
     *
     * @throws std::invalid_argument if the codon is not present in the table.
     */
    char get_amino_acid(std::string codon) {
        replace_all(codon, 'U', 'T');
        const auto entry = codon_table.find(codon);
        if (entry == codon_table.end()) {
            throw std::invalid_argument("Error: codon '" + codon + "' not found in genetic code");
        }
        return entry->second;
    }
};
// Definitions of GeneticCode's static members. They live in a header, so they
// are declared `inline`: otherwise every translation unit including this file
// would emit its own definition and the program would fail to link.
inline GeneticCode *GeneticCode::pinstance_{nullptr};
inline std::mutex GeneticCode::mutex_;
/**
 * Return the shared GeneticCode, creating it with the default constructor on
 * first use. Creation is serialised by `mutex_`.
 *
 * Declared `inline` because the definition lives in a header.
 *
 * @return pointer to the shared instance; it is never deleted by this function.
 */
inline GeneticCode *GeneticCode::GetInstance()
{
    std::lock_guard<std::mutex> lock(mutex_);
    if (pinstance_ == nullptr) {
        pinstance_ = new GeneticCode();
    }
    return pinstance_;
}
/**
 * Return the shared GeneticCode, loading it from `input_filename` on first use.
 *
 * If the instance already exists, `input_filename` is ignored and the
 * existing instance is returned unchanged. Creation is serialised by `mutex_`.
 *
 * Declared `inline` because the definition lives in a header.
 *
 * @param input_filename path of the codon table to load on first creation.
 * @return pointer to the shared instance; it is never deleted by this function.
 */
inline GeneticCode *GeneticCode::GetInstance(std::string input_filename)
{
    std::lock_guard<std::mutex> lock(mutex_);
    if (pinstance_ == nullptr) {
        pinstance_ = new GeneticCode(input_filename);
    }
    return pinstance_;
}

92
include/Sequence.hpp Normal file
View File

@ -0,0 +1,92 @@
#pragma once
#include <vector>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include "GeneticCode.hpp"
#include "strutils.hpp"
/**
 * Generic biological sequence: a thin wrapper around the residue string.
 * Base class of the DNA, RNA and protein sequence types.
 */
class Sequence {
public:
    /// Store a copy of `sequence`; no validation is performed.
    Sequence(std::string sequence) : sequence(sequence) {}

    /// Number of characters in the sequence.
    unsigned int get_length() const {
        return static_cast<unsigned int>(sequence.length());
    }

    /// Copy of the stored sequence string.
    std::string get_sequence() const {
        return sequence;
    }

protected:
    std::string sequence;
};
/**
 * Amino-acid sequence. Adds nothing to Sequence beyond a distinct type; the
 * base-class constructor is inherited.
 */
class ProteinSequence : public Sequence {
public:
    using Sequence::Sequence;
};
/**
 * RNA sequence that can be translated into a protein.
 */
class RNASequence : public Sequence {
    using Sequence::Sequence;

public:
    /**
     * Translate the sequence codon by codon, starting at offset 0, using the
     * shared GeneticCode instance.
     *
     * Only complete codons are translated: up to two trailing characters that
     * do not form a triplet are ignored, and a sequence shorter than three
     * characters yields an empty protein.
     *
     * @return the translated protein, one letter per codon.
     * @throws std::invalid_argument if a codon is missing from the genetic code.
     */
    ProteinSequence translate_to_protein() const {
        GeneticCode *code = GeneticCode::GetInstance();
        std::string protein_sequence;
        protein_sequence.reserve(this->sequence.length() / 3);
        // `i + 3 <= length` rather than `i < length - 2`: the subtraction is
        // unsigned and would wrap around for sequences shorter than two.
        for (std::string::size_type i = 0; i + 3 <= this->sequence.length(); i += 3) {
            protein_sequence += code->get_amino_acid(this->sequence.substr(i, 3));
        }
        return ProteinSequence(protein_sequence);
    }
};
class DNASequence: public Sequence {
using Sequence::Sequence;
public:
DNASequence get_complement() {
std::string complement = "";
for (char &c: this->sequence) {
complement += DNASequence::complement(c);
}
return DNASequence(complement);
}
static char complement(char c) {
if (c == 'A') {
return 'T';
} else if (c == 'T') {
return 'A';
} else if (c == 'G') {
return 'C';
} else if (c == 'C') {
return 'G';
} else {
throw std::invalid_argument("Error: nucleotide " + std::to_string(c) + " has no complement");
}
}
RNASequence transcribe_to_rna() {
std::string rna_sequence = this->sequence.substr(0);
replace_all(rna_sequence, 'T', 'U');
return RNASequence(rna_sequence);
}
};

54
include/fasta.hpp Normal file
View File

@ -0,0 +1,54 @@
/** Simple FASTA file format reader
*
* @author The BioloGeeks Team
* @date 2023-09-30
*/
#pragma once
#include <iostream>
#include <fstream>
#include <vector>
/**
 * One FASTA entry: the header text and the sequence, both publicly
 * accessible.
 */
class Record {
public:
    std::string name, seq;

    /// Store copies of the given header text and sequence.
    Record(std::string name, std::string seq) : name(name), seq(seq) {}
};
namespace fasta {
/**
* @see https://www.rosettacode.org/wiki/FASTA_format?section=10#C++
*
*/
std::vector<Record> read_file(std::ifstream &input) {
std::string line, name, content;
std::vector<Record> Records;
// Read fasta file lines and append each FASTA records
// to the Records vector.
while (input.good()) {
std::getline(input, line);
if (line[0] == '>') {
if (!name.empty()) {
Record Record(name, content);
Records.push_back(Record);
name.clear();
content.clear();
}
name = line.substr(1);
} else {
content += line;
}
}
if (!name.empty()) {
Record Record(name, content);
Records.push_back(Record);
}
return Records;
}
}

30
include/strutils.hpp Normal file
View File

@ -0,0 +1,30 @@
#pragma once
#include <algorithm>
#include <cctype>
#include <functional>
#include <sstream>
#include <string>
#include <vector>
/**
 * Replace every occurrence of `from` in `str` with `to`, in place.
 *
 * Declared `inline` because the definition lives in a header that several
 * translation units may include.
 */
inline void replace_all(std::string &str, char from, char to) {
    std::replace(str.begin(), str.end(), from, to);
}
/**
 * Split `str` on every occurrence of `delim`.
 *
 * Empty fields between two delimiters are kept. An empty input yields an
 * empty vector, and a single trailing delimiter does not produce a final
 * empty field.
 *
 * The input is taken by const reference so that temporaries and const
 * strings can be passed. Declared `inline` because the definition lives in a
 * header.
 *
 * @return the fields in order of appearance.
 */
inline std::vector<std::string> split_string(const std::string &str, char delim) {
    std::stringstream sstream(str);
    std::vector<std::string> fields;
    std::string item;
    while (std::getline(sstream, item, delim)) {
        fields.push_back(item);
    }
    return fields;
}
/**
 * Convert `str` to upper case in place using std::toupper.
 *
 * Each character is passed to std::toupper as `unsigned char`, as that
 * function requires, and the result is cast back to `char` explicitly.
 * Declared `inline` because the definition lives in a header.
 */
inline void uppercase_string(std::string &str) {
    std::transform(str.cbegin(), str.cend(),
                   str.begin(), // write to the same location
                   [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
}

64
resources/codons.txt Normal file
View File

@ -0,0 +1,64 @@
AAA Lys K Lysine
AAC Asn N Asparagine
AAG Lys K Lysine
AAT Asn N Asparagine
ACA Thr T Threonine
ACC Thr T Threonine
ACG Thr T Threonine
ACT Thr T Threonine
AGA Arg R Arginine
AGC Ser S Serine
AGG Arg R Arginine
AGT Ser S Serine
ATA Ile I Isoleucine
ATC Ile I Isoleucine
ATG Met M Methionine
ATT Ile I Isoleucine
CAA Gln Q Glutamine
CAC His H Histidine
CAG Gln Q Glutamine
CAT His H Histidine
CCA Pro P Proline
CCC Pro P Proline
CCG Pro P Proline
CCT Pro P Proline
CGA Arg R Arginine
CGC Arg R Arginine
CGG Arg R Arginine
CGT Arg R Arginine
CTA Leu L Leucine
CTC Leu L Leucine
CTG Leu L Leucine
CTT Leu L Leucine
GAA Glu E Glutamic_acid
GAC Asp D Aspartic_acid
GAG Glu E Glutamic_acid
GAT Asp D Aspartic_acid
GCA Ala A Alanine
GCC Ala A Alanine
GCG Ala A Alanine
GCT Ala A Alanine
GGA Gly G Glycine
GGC Gly G Glycine
GGG Gly G Glycine
GGT Gly G Glycine
GTA Val V Valine
GTC Val V Valine
GTG Val V Valine
GTT Val V Valine
TAA Stp O Stop
TAC Tyr Y Tyrosine
TAG Stp O Stop
TAT Tyr Y Tyrosine
TCA Ser S Serine
TCC Ser S Serine
TCG Ser S Serine
TCT Ser S Serine
TGA Stp O Stop
TGC Cys C Cysteine
TGG Trp W Tryptophan
TGT Cys C Cysteine
TTA Leu L Leucine
TTC Phe F Phenylalanine
TTG Leu L Leucine
TTT Phe F Phenylalanine

53
src/main.cpp Normal file
View File

@ -0,0 +1,53 @@
/** Basic DNA sequence
*/
#include <cstdlib>
#include <fstream>
#include <ios>
#include <iostream>
#include <type_traits>
#include <unistd.h>
#include "../include/Sequence.hpp"
#include "../include/GeneticCode.hpp"
#include "../include/fasta.hpp"
int main(int argc, char *argv[]) {
std::string codon_filename = "resources/codons.txt";
std::string fasta_filename = "";
int opt;
while ((opt = getopt (argc, argv, "i:c:h")) != -1) {
switch(opt) {
case 'i':
fasta_filename = optarg;
break;
case 'c':
codon_filename = optarg;
break;
case '?':
case 'h':
default:
std::cout << "Usage: " << argv[0] << "-i <dna-fasta>" << std::endl;
break;
case -1:
break;
}
}
if (fasta_filename.length() == 0) {
std::cerr << "Error: no fasta file given" << std::endl;
return EXIT_FAILURE;
}
GeneticCode::GetInstance(codon_filename);
std::ifstream fasta_ifstream(fasta_filename);
std::vector<Record> fasta_records = fasta::read_file(fasta_ifstream);
Record fasta_record(fasta_records.at(0));
std::string sequence(fasta_record.seq);
uppercase_string(sequence);
DNASequence dna_sequence(sequence);
RNASequence rna_sequence = dna_sequence.transcribe_to_rna();
ProteinSequence protein_sequence = rna_sequence.translate_to_protein();
std::cout << dna_sequence.get_sequence() << std::endl;
std::cout << rna_sequence.get_sequence() << std::endl;
std::cout << protein_sequence.get_sequence() << std::endl;
return EXIT_SUCCESS;
}

View File

18
tests/Sequence_test.cpp Normal file
View File

@ -0,0 +1,18 @@
#include <gtest/gtest.h>
#include "../include/Sequence.hpp"
// Transcribing "ATG" must yield "AUG".
TEST(TranscriptionTest, BasicTranscription) {
    DNASequence dna("ATG");
    RNASequence rna = dna.transcribe_to_rna();
    EXPECT_EQ(std::string("AUG"), rna.get_sequence());
}
// Transcribing then translating "ATGATG" must yield "MM".
TEST(TranslationTest, BasicTranslation) {
    // The codon table path is relative to the directory the test runs in.
    GeneticCode::GetInstance("../resources/codons.txt");
    DNASequence dna("ATGATG");
    ProteinSequence protein = dna.transcribe_to_rna().translate_to_protein();
    EXPECT_EQ(std::string("MM"), protein.get_sequence());
}