From b5cdf0a8287f0748460de785639d243ef348d681 Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Sun, 1 Oct 2023 12:26:59 +0200 Subject: [PATCH] feat: Exercise considered done --- .gitattributes | 1 + .gitignore | 2 + CMakeLists.txt | 39 +++++++++++++++ Makefile | 22 +++++++++ README.md | 24 ++++++++++ data/sample.fa | 2 + include/GeneticCode.hpp | 98 ++++++++++++++++++++++++++++++++++++++ include/Sequence.hpp | 92 +++++++++++++++++++++++++++++++++++ include/fasta.hpp | 54 +++++++++++++++++++++ include/strutils.hpp | 30 ++++++++++++ resources/codons.txt | 64 +++++++++++++++++++++++++ src/main.cpp | 53 +++++++++++++++++++++ tests/GeneticCode_test.cpp | 0 tests/Sequence_test.cpp | 18 +++++++ 14 files changed, 499 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 Makefile create mode 100644 README.md create mode 100644 data/sample.fa create mode 100644 include/GeneticCode.hpp create mode 100644 include/Sequence.hpp create mode 100644 include/fasta.hpp create mode 100644 include/strutils.hpp create mode 100644 resources/codons.txt create mode 100644 src/main.cpp create mode 100644 tests/GeneticCode_test.cpp create mode 100644 tests/Sequence_test.cpp diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..4529948 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +data filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..664ec7a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +build +Testing \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..ab61d76 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,39 @@ +cmake_minimum_required(VERSION 3.9) +project(dnapp) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -g3 -ggdb") + +set(SRCS src/main.cpp) + +add_executable(${PROJECT_NAME} ${SRCS}) + 
+if(MSVC) + target_compile_options(${PROJECT_NAME} PRIVATE /W4 /WX) +else() + target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -Wpedantic -Werror) +endif() + +# Google Test +include(FetchContent) +FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip +) +# For Windows: Prevent overriding the parent project's compiler/linker settings +set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) +FetchContent_MakeAvailable(googletest) +enable_testing() + +add_executable( + Sequence_test + tests/Sequence_test.cpp +) +target_link_libraries( + Sequence_test + GTest::gtest_main +) + +include(GoogleTest) +gtest_discover_tests(Sequence_test) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..92a75cc --- /dev/null +++ b/Makefile @@ -0,0 +1,22 @@ +all: build run + +build: + cmake -S . -B build -DCMAKE_BUILD_TYPE=Release + cmake --build build + +conan-install: + conan install . --output-folder=build --build=missing + +conan-build: conan-install + cmake -S . -B build/ -DCMAKE_TOOLCHAIN_FILE=conan_toolchain.cmake -DCMAKE_BUILD_TYPE=Release + cmake --build build + +run: + ./build/dnapp -i ./data/sample.fa + +test: + cmake -S . -B build + cmake --build build + cd build && ctest --output-on-failure + +.PHONY: build \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..8137556 --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# DNA++ + +## Instructions + +1. Create a base class Sequence that represents a generic sequence. Include the following attributes and methods: + 1. Attributes: + 1. sequence: A string representing the sequence itself. + 2. Methods: + 1. `__init__(self, sequence)`: Initialize the sequence. + 2. `get_length(self)`: Return the length of the sequence. + 3. `get_sequence(self)`: Return the sequence as a string. +2. Create a subclass DNASequence that inherits from the Sequence class. Add the following methods: + 1. 
`get_complement(self)`: Return the complementary DNA sequence. A pairs with T, and C pairs with G. + 2. `transcribe_to_rna(self)`: Return the RNA sequence by replacing all T's with U's. +3. Create another subclass ProteinSequence that inherits from the Sequence class. Add a method `translate_to_protein(self)` that translates the DNA sequence into a protein +sequence using the genetic code (you can use a dictionary to map codons to amino acids). +4. Provide an example DNA sequence and demonstrate the functionality of your program by: + 1. Creating a DNA sequence object. + 2. Getting its complement and transcribed RNA sequence. + 3. Translating it into a protein sequence. + +Hints: +* You can create a dictionary to represent the genetic code, where keys are codons (triplets of nucleotides), and values are the corresponding amino acids. +* The complement of a DNA sequence can be found by replacing A with T, T with A, C with G, and G with C. \ No newline at end of file diff --git a/data/sample.fa b/data/sample.fa new file mode 100644 index 0000000..0d1f065 --- /dev/null +++ b/data/sample.fa @@ -0,0 +1,2 @@ +>BBa_J45004 SAM:benzoic acid/salicylic acid carboxyl methyltransferase I; converts salicylic acid to methyl sali 
+atggaagttgttgaagttcttcacatgaatggaggaaatggagacagtagctatgcaaacaattctttggttcagcaaaaggtgattctcatgacaaagccaataactgagcaagccatgattgatctctacagcagcctctttccagaaaccttatgcattgcagatttgggttgttctttgggagctaacactttcttggtggtctcacagcttgttaaaatagtagaaaaagaacgaaaaaagcatggttttaagtctccagagttttattttcacttcaatgatcttcctggcaatgattttaatacactttttcagtcactgggggcatttcaagaagatttgagaaagcatataggggaaagctttggtccatgttttttcagtggagtgcctggttcattttatactagacttttcccttccaaaagtttacattttgtttactcctcctacagtctcatgtggctatctcaggtgcctaatgggattgaaaataacaagggaaacatttacatggcaagaacaagccctctaagtgttattaaagcatactacaagcaatatgaaatagatttttcaaattttctcaagtaccgttcagaggaattgatgaaaggtggaaagatggtgttaacactcctaggtagagaaagtgaggatcctactagcaaagaatgctgttacatttgggagcttctagccatggccctcaataagttggttgaagagggattgataaaagaagagaaagtagatgcattcaatattcctcaatacacaccatcaccagcagaagtaaagtacatagttgagaaggaaggatcattcaccattaatcgcttggaaacatcaagagttcattggaatgcttctaataatgagaagaatggtggttacaatgtgtcaaggtgcatgagagctgtggctgagcctttgcttgtcagccactttgacaaggaattgatggatttagtgttccacaagtacgaagagattgtttctgattgcatgtccaaagagaatactgagtttataaatgtcatcatctccttgaccaaaataaattaa \ No newline at end of file diff --git a/include/GeneticCode.hpp b/include/GeneticCode.hpp new file mode 100644 index 0000000..7d147e2 --- /dev/null +++ b/include/GeneticCode.hpp @@ -0,0 +1,98 @@ +/** Implement the genetic code as a thread-safe GeneticCode. 
/** Genetic code lookup table exposed as a singleton whose creation is guarded
 *  by a mutex, plus the sequence classes that use it for translation.
 *
 * @see https://refactoring.guru/design-patterns/singleton/cpp/example#example-1
 */

#pragma once

#include <algorithm>
#include <cstddef>
#include <fstream>
#include <mutex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>


/**
 * Codon -> amino acid table loaded once from a tab-separated text file.
 *
 * Each non-empty line of the file is split on tabs; the first field is the
 * codon and the first character of the third field is the amino acid letter.
 */
class GeneticCode {

private:
    // `inline` statics (C++17) have exactly one definition program-wide, so
    // this header can be included from several translation units.
    // The instance is created on first use and never deleted.
    inline static GeneticCode *pinstance_{nullptr};
    inline static std::mutex mutex_;

protected:
    /// Codon (first column of the table) -> one-letter amino acid code.
    std::unordered_map<std::string, char> codon_table;

    /// Load the table from the default location, relative to the working directory.
    GeneticCode() : GeneticCode("./resources/codons.txt") {}

    /**
     * Load the table from @p input_filename.
     *
     * @throws std::invalid_argument if the file cannot be opened.
     * @throws std::out_of_range if a non-empty line has fewer than three
     *         tab-separated fields or an empty third field.
     */
    GeneticCode(std::string input_filename) {
        std::ifstream input(input_filename);
        if (!input.good()) {
            throw std::invalid_argument("Error: could not read codon table from " + input_filename + " file.");
        }
        load_genetic_code(input);
    }

    /**
     * Fill codon_table from an already opened stream.
     *
     * Blank lines (for example the one produced by a trailing newline) are
     * skipped and a trailing carriage return is stripped, so files with a
     * final newline or CRLF line endings load correctly.
     *
     * @throws std::invalid_argument if the stream is not readable.
     * @throws std::out_of_range on a malformed non-empty line.
     */
    void load_genetic_code(std::ifstream &input) {
        if (!input.good()) {
            throw std::invalid_argument("Error: Could not read the codon table from the text file.");
        }
        std::string line;
        // Looping on getline itself avoids processing a phantom last line,
        // which `while (input.good())` did.
        while (std::getline(input, line)) {
            if (!line.empty() && line.back() == '\r') {
                line.pop_back();
            }
            if (line.empty()) {
                continue;
            }
            std::vector<std::string> fields;
            std::istringstream row(line);
            for (std::string field; std::getline(row, field, '\t');) {
                fields.push_back(field);
            }
            const std::string &codon = fields.at(0);
            const char amino_acid = fields.at(2).at(0);
            codon_table.insert({codon, amino_acid});
        }
    }

public:
    GeneticCode(GeneticCode &other) = delete;
    void operator=(const GeneticCode &) = delete;

    /**
     * Return the shared instance, creating it from the default table file on
     * first use.
     */
    static GeneticCode *GetInstance() {
        std::lock_guard<std::mutex> lock(mutex_);
        if (pinstance_ == nullptr) {
            pinstance_ = new GeneticCode();
        }
        return pinstance_;
    }

    /**
     * Return the shared instance, creating it from @p input_filename on first
     * use. If the instance already exists, @p input_filename is ignored and
     * the existing instance is returned.
     */
    static GeneticCode *GetInstance(std::string input_filename) {
        std::lock_guard<std::mutex> lock(mutex_);
        if (pinstance_ == nullptr) {
            pinstance_ = new GeneticCode(input_filename);
        }
        return pinstance_;
    }

    /**
     * Look up the amino acid letter for @p codon.
     *
     * 'U' is mapped to 'T' before the lookup, so RNA codons can be passed.
     *
     * @throws std::invalid_argument if the codon is not in the table.
     */
    char get_amino_acid(std::string codon) const {
        std::replace(codon.begin(), codon.end(), 'U', 'T');
        const auto entry = codon_table.find(codon);
        if (entry == codon_table.end()) {
            throw std::invalid_argument("Error: codon '" + codon + "' not found in genetic code");
        }
        return entry->second;
    }
};


/** Generic sequence: a thin wrapper around the sequence string. */
class Sequence {

public:
    Sequence(std::string sequence) : sequence(std::move(sequence)) {}

    /// Number of characters in the sequence.
    unsigned int get_length() const {
        return static_cast<unsigned int>(this->sequence.length());
    }

    /// Copy of the sequence string.
    std::string get_sequence() const {
        return this->sequence;
    }

protected:
    std::string sequence;
};


/** Protein sequence; adds nothing to Sequence beyond its type. */
class ProteinSequence : public Sequence {

public:
    using Sequence::Sequence;
};


/** RNA sequence that can be translated with the shared GeneticCode. */
class RNASequence : public Sequence {

public:
    using Sequence::Sequence;

    /**
     * Translate consecutive, non-overlapping codons starting at index 0.
     *
     * Trailing characters that do not form a full codon are ignored, so a
     * sequence shorter than three characters yields an empty protein.
     *
     * @throws std::invalid_argument if a codon is missing from the table.
     */
    ProteinSequence translate_to_protein() const {
        GeneticCode *code = GeneticCode::GetInstance();
        std::string protein_sequence;
        protein_sequence.reserve(this->sequence.length() / 3);
        // `i + 3 <= length` cannot underflow, unlike `i < length - 2` on an
        // unsigned length smaller than 2.
        for (std::size_t i = 0; i + 3 <= this->sequence.length(); i += 3) {
            protein_sequence += code->get_amino_acid(this->sequence.substr(i, 3));
        }
        return ProteinSequence(protein_sequence);
    }
};


/** DNA sequence with complement and transcription helpers. */
class DNASequence : public Sequence {

public:
    using Sequence::Sequence;

    /**
     * Complement every nucleotide, keeping the original order.
     *
     * @throws std::invalid_argument on any character other than A, T, G, C.
     */
    DNASequence get_complement() const {
        std::string complement;
        complement.reserve(this->sequence.length());
        for (const char c : this->sequence) {
            complement += DNASequence::complement(c);
        }
        return DNASequence(complement);
    }

    /**
     * Complement of a single upper-case nucleotide (A<->T, G<->C).
     *
     * @throws std::invalid_argument for any other character; the message
     *         contains the offending character itself.
     */
    static char complement(char c) {
        switch (c) {
            case 'A': return 'T';
            case 'T': return 'A';
            case 'G': return 'C';
            case 'C': return 'G';
            default:
                // std::to_string(c) would print the numeric character code.
                throw std::invalid_argument(std::string("Error: nucleotide '") + c + "' has no complement");
        }
    }

    /// Return the RNA sequence obtained by replacing every 'T' with 'U'.
    RNASequence transcribe_to_rna() const {
        std::string rna_sequence = this->sequence;
        std::replace(rna_sequence.begin(), rna_sequence.end(), 'T', 'U');
        return RNASequence(rna_sequence);
    }
};
/** Simple FASTA file format reader and small string helpers.
 *
 * @author The BioloGeeks Team
 * @date 2023-09-30
 */
#pragma once

#include <algorithm>
#include <cctype>
#include <fstream>
#include <istream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

/** One FASTA entry: the header text after '>' and the concatenated sequence lines. */
class Record {
public:
    Record(std::string name, std::string seq)
        : name(std::move(name)), seq(std::move(seq)) {}

    std::string name, seq;
};

namespace fasta {

    /**
     * Read every record from a FASTA stream.
     *
     * A line starting with '>' opens a new record whose name is the rest of
     * that line; the following lines, up to the next header or end of input,
     * are concatenated into the record's sequence. Lines that appear before
     * the first header are ignored, and a trailing carriage return is removed
     * from each line.
     *
     * Accepts any std::istream, so std::ifstream callers keep working.
     *
     * @see https://www.rosettacode.org/wiki/FASTA_format?section=10#C++
     *
     * @param input stream positioned at the start of the FASTA text.
     * @return the records in file order; empty if no header line was found.
     */
    inline std::vector<Record> read_file(std::istream &input) {
        std::vector<Record> records;
        std::string line, name, content;
        // Tracks whether a header has been seen, so that a record with an
        // empty name is still emitted and stray leading lines are not glued
        // onto the first record.
        bool in_record = false;
        while (std::getline(input, line)) {
            if (!line.empty() && line.back() == '\r') {
                line.pop_back();
            }
            if (!line.empty() && line.front() == '>') {
                if (in_record) {
                    records.emplace_back(name, content);
                }
                name = line.substr(1);
                content.clear();
                in_record = true;
            } else if (in_record) {
                content += line;
            }
        }

        if (in_record) {
            records.emplace_back(name, content);
        }
        return records;
    }
}

/** Replace every occurrence of @p from in @p str with @p to, in place. */
inline void replace_all(std::string &str, char from, char to) {
    std::replace(str.begin(), str.end(), from, to);
}

/**
 * Split @p str on @p delim.
 *
 * Empty fields between consecutive delimiters are kept; an empty input
 * yields an empty vector.
 */
inline std::vector<std::string> split_string(const std::string &str, char delim) {
    std::stringstream sstream(str);
    std::vector<std::string> fields;
    for (std::string item; std::getline(sstream, item, delim);) {
        fields.push_back(item);
    }
    return fields;
}

/** Convert @p str to upper case in place using std::toupper. */
inline void uppercase_string(std::string &str) {
    std::transform(str.cbegin(), str.cend(),
                   str.begin(), // write to the same location
                   // unsigned char avoids undefined behaviour in std::toupper
                   // for negative char values.
                   [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
}
a/resources/codons.txt b/resources/codons.txt new file mode 100644 index 0000000..8601ace --- /dev/null +++ b/resources/codons.txt @@ -0,0 +1,64 @@ +AAA Lys K Lysine +AAC Asn N Asparagine +AAG Lys K Lysine +AAT Asn N Asparagine +ACA Thr T Threonine +ACC Thr T Threonine +ACG Thr T Threonine +ACT Thr T Threonine +AGA Arg R Arginine +AGC Ser S Serine +AGG Arg R Arginine +AGT Ser S Serine +ATA Ile I Isoleucine +ATC Ile I Isoleucine +ATG Met M Methionine +ATT Ile I Isoleucine +CAA Gln Q Glutamine +CAC His H Histidine +CAG Gln Q Glutamine +CAT His H Histidine +CCA Pro P Proline +CCC Pro P Proline +CCG Pro P Proline +CCT Pro P Proline +CGA Arg R Arginine +CGC Arg R Arginine +CGG Arg R Arginine +CGT Arg R Arginine +CTA Leu L Leucine +CTC Leu L Leucine +CTG Leu L Leucine +CTT Leu L Leucine +GAA Glu E Glutamic_acid +GAC Asp D Aspartic_acid +GAG Glu E Glutamic_acid +GAT Asp D Aspartic_acid +GCA Ala A Alanine +GCC Ala A Alanine +GCG Ala A Alanine +GCT Ala A Alanine +GGA Gly G Glycine +GGC Gly G Glycine +GGG Gly G Glycine +GGT Gly G Glycine +GTA Val V Valine +GTC Val V Valine +GTG Val V Valine +GTT Val V Valine +TAA Stp O Stop +TAC Tyr Y Tyrosine +TAG Stp O Stop +TAT Tyr Y Tyrosine +TCA Ser S Serine +TCC Ser S Serine +TCG Ser S Serine +TCT Ser S Serine +TGA Stp O Stop +TGC Cys C Cysteine +TGG Trp W Tryptophan +TGT Cys C Cysteine +TTA Leu L Leucine +TTC Phe F Phenylalanine +TTG Leu L Leucine +TTT Phe F Phenylalanine \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..84968a7 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,53 @@ +/** Basic DNA sequence + */ +#include +#include +#include +#include +#include +#include + +#include "../include/Sequence.hpp" +#include "../include/GeneticCode.hpp" +#include "../include/fasta.hpp" + +int main(int argc, char *argv[]) { + std::string codon_filename = "resources/codons.txt"; + std::string fasta_filename = ""; + int opt; + while ((opt = getopt (argc, argv, "i:c:h")) != -1) { + switch(opt) { + 
case 'i': + fasta_filename = optarg; + break; + case 'c': + codon_filename = optarg; + break; + case '?': + case 'h': + default: + std::cout << "Usage: " << argv[0] << "-i " << std::endl; + break; + case -1: + break; + } + } + if (fasta_filename.length() == 0) { + std::cerr << "Error: no fasta file given" << std::endl; + return EXIT_FAILURE; + } + + GeneticCode::GetInstance(codon_filename); + std::ifstream fasta_ifstream(fasta_filename); + std::vector fasta_records = fasta::read_file(fasta_ifstream); + Record fasta_record(fasta_records.at(0)); + std::string sequence(fasta_record.seq); + uppercase_string(sequence); + DNASequence dna_sequence(sequence); + RNASequence rna_sequence = dna_sequence.transcribe_to_rna(); + ProteinSequence protein_sequence = rna_sequence.translate_to_protein(); + std::cout << dna_sequence.get_sequence() << std::endl; + std::cout << rna_sequence.get_sequence() << std::endl; + std::cout << protein_sequence.get_sequence() << std::endl; + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/tests/GeneticCode_test.cpp b/tests/GeneticCode_test.cpp new file mode 100644 index 0000000..e69de29 diff --git a/tests/Sequence_test.cpp b/tests/Sequence_test.cpp new file mode 100644 index 0000000..3c0e152 --- /dev/null +++ b/tests/Sequence_test.cpp @@ -0,0 +1,18 @@ +#include + +#include "../include/Sequence.hpp" + +TEST(TranscriptionTest, BasicTranscription) { + std::string source = "ATG"; + std::string target = "AUG"; + std::string result = DNASequence(source).transcribe_to_rna().get_sequence(); + EXPECT_EQ(target, result); +} + +TEST(TranslationTest, BasicTranslation) { + GeneticCode::GetInstance("../resources/codons.txt"); + std::string dna_source = "ATGATG"; + std::string aa_target = "MM"; + std::string result = DNASequence(dna_source).transcribe_to_rna().translate_to_protein().get_sequence(); + EXPECT_EQ(aa_target, result); +}