From 13f872cf10f8307039f12db23a8f54fe4de6dfe2 Mon Sep 17 00:00:00 2001 From: Samuel Ortion Date: Fri, 8 Nov 2024 12:42:59 +0100 Subject: [PATCH] Rust tagfinder --- rust/tagfinder/Cargo.lock | 237 +++++++++++++++++++++++ rust/tagfinder/Cargo.toml | 7 + rust/tagfinder/Makefile | 2 + rust/tagfinder/src/finder.rs | 135 +++++++++++++ rust/tagfinder/src/main.rs | 57 ++++++ rust/tagfinder/test/samples/families.tsv | 4 + rust/tagfinder/test/samples/position.tsv | 6 + 7 files changed, 448 insertions(+) create mode 100644 rust/tagfinder/Cargo.lock create mode 100644 rust/tagfinder/Cargo.toml create mode 100644 rust/tagfinder/Makefile create mode 100644 rust/tagfinder/src/finder.rs create mode 100644 rust/tagfinder/src/main.rs create mode 100644 rust/tagfinder/test/samples/families.tsv create mode 100644 rust/tagfinder/test/samples/position.tsv diff --git a/rust/tagfinder/Cargo.lock b/rust/tagfinder/Cargo.lock new file mode 100644 index 0000000..4de8dd6 --- /dev/null +++ b/rust/tagfinder/Cargo.lock @@ -0,0 +1,237 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "anstream" +version = "0.6.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "anstyle-parse" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] +name = "clap" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" + +[[package]] +name = "colorchoice" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "proc-macro2" +version = "1.0.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "syn" +version = "2.0.87" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tagfinder" +version = "0.1.0" +dependencies = [ + "clap", +] + +[[package]] +name = "unicode-ident" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/rust/tagfinder/Cargo.toml b/rust/tagfinder/Cargo.toml new file mode 100644 index 0000000..97eff0f --- /dev/null +++ b/rust/tagfinder/Cargo.toml @@ -0,0 +1,7 @@ +[package] +name = "tagfinder" +version = "0.1.0" +edition = "2021" + +[dependencies] +clap = { version = "4.5.20", features = ["derive"] } diff --git a/rust/tagfinder/Makefile b/rust/tagfinder/Makefile new file mode 100644 index 0000000..34892ff --- /dev/null +++ b/rust/tagfinder/Makefile @@ -0,0 +1,2 @@ +run: + cargo run -- --families test/samples/families.tsv --positions test/samples/position.tsv --definitions 0,1 \ No newline at end of file diff --git a/rust/tagfinder/src/finder.rs b/rust/tagfinder/src/finder.rs new file mode 100644 index 0000000..f7fa48c --- /dev/null +++ b/rust/tagfinder/src/finder.rs @@ -0,0 +1,135 @@ +use std::cmp::Ordering; +use std::collections::HashMap; +use std::fs::File; +use std::io::{self, BufRead}; +use std::path::Path; + +fn read_lines

(filename: P) -> io::Result>> +where + P: AsRef, +{ + let file = File::open(filename)?; + Ok(io::BufReader::new(file).lines()) +} + +pub fn parse_families

(filename: P) -> HashMap +where + P: AsRef, +{ + let mut map: HashMap = HashMap::new(); + if let Ok(lines) = read_lines(filename) { + for line in lines.flatten() { + let parts: Vec<&str> = line.split("\t").collect(); + let gene = parts[0]; + let family: u64 = parts[1].parse().unwrap(); + map.insert(gene.to_string(), family); + } + } + map +} + +pub fn parse_position

(filename: P) -> Vec<(String, String, u64, u64)> +where + P: AsRef, +{ + let mut vec: Vec<(String, String, u64, u64)> = Vec::new(); + if let Ok(lines) = read_lines(filename) { + for line in lines.flatten() { + let parts: Vec<&str> = line.split("\t").collect(); + let gene = parts[0]; + let chromosome = parts[1]; + let start: u64 = parts[2].parse().unwrap(); + let end: u64 = parts[3].parse().unwrap(); + vec.push((gene.to_string(), chromosome.to_string(), start, end)); + } + } + vec +} + +pub fn compare_gene_position( + position_a: &(String, String, u64, u64), + position_b: &(String, String, u64, u64), +) -> Ordering { + let (_gene_a, chromosome_a, start_a, _end_b) = position_a; + let (_gene_b, chromosome_b, start_b, _end_b) = position_b; + if chromosome_a < chromosome_b { + return Ordering::Less; + } else if chromosome_a > chromosome_b { + return Ordering::Greater; + } else { + if start_a < start_b { + return Ordering::Less; + } else { + return Ordering::Greater; + } + } +} + +pub fn detect_tag_of_definitions( + definitions: &Vec, + gene_positions: &Vec<(String, String, u64, u64)>, + gene_families: &HashMap, +) -> HashMap>)> { + let mut tag_numbers: HashMap>)> = HashMap::new(); + for definition in definitions { + let tag_numbers_def = detect_tag_of_definition(*definition, gene_positions, gene_families); + for (gene, (family, tag)) in tag_numbers_def { + if tag_numbers.contains_key(&gene) { + let (_gene, tags) = tag_numbers.get_mut(&gene).unwrap(); + tags.push(tag); + } else { + tag_numbers.insert(gene, (family, vec![tag])); + } + } + } + tag_numbers +} + +pub fn detect_tag_of_definition( + definition: u64, + gene_positions: &Vec<(String, String, u64, u64)>, + gene_families: &HashMap, +) -> HashMap)> { + let mut tag_numbers: HashMap)> = HashMap::new(); + let mut spacer_index: u64 = 0; + let mut tag_index: u64 = 0; + for (i, (gene_i, chromosome_i, _start_i, _end_i)) in gene_positions.iter().enumerate() { + let family_repr: String; + let mut is_on_tag: bool = false; + if let Some(family_i) = gene_families.get(gene_i) { + family_repr = format!("{}", family_i); + let mut nb_intra_tag_spacers: u64 = 0; + if !tag_numbers.contains_key(gene_i) { + for (gene_j, chromosome_j, _start_j, _end_j) in &gene_positions[(i + 1)..] { + if *chromosome_i != *chromosome_j { + break; + } + if let Some(family_j) = gene_families.get(gene_j) { + if *family_i == *family_j { + nb_intra_tag_spacers = 0; + if !is_on_tag { + is_on_tag = true; + tag_index += 1; + tag_numbers.insert(gene_i.clone(), (family_repr.clone(), Some(tag_index))); + } + tag_numbers.insert(gene_j.clone(), (family_repr.clone(), Some(tag_index))); + } + } else { + nb_intra_tag_spacers += 1; + } + if nb_intra_tag_spacers > definition { + break; + } + } + if !is_on_tag { + tag_numbers.insert(gene_i.clone(), (family_repr, None)); + } + } + } else { + spacer_index += 1; + family_repr = format!("spacer{}", spacer_index); + tag_numbers.insert(gene_i.clone(), (family_repr, None)); + } + } + tag_numbers +} diff --git a/rust/tagfinder/src/main.rs b/rust/tagfinder/src/main.rs new file mode 100644 index 0000000..73a7068 --- /dev/null +++ b/rust/tagfinder/src/main.rs @@ -0,0 +1,57 @@ +use std::{collections::HashMap, path::PathBuf}; +use clap::Parser; +use finder::{detect_tag_of_definitions, parse_families, parse_position}; + +mod finder; + +#[derive(Parser)] +#[command(version="0.0.1", about="Find Tandemly Arrayed Genes", long_about="Find Tandemly Arrayed Genes in a genome based on gene position and Tandemly Arrayed Genes definition (i. e. number of spacers)")] +struct Cli { + #[arg(short='f', long="families", value_name="FAMILIES FILE")] + families: PathBuf, + + #[arg(short='p', long="positions", value_name="POSITIONS FILE")] + positions: PathBuf, + + #[arg(short='d', long="definitions", value_name="TAGs DEFINITIONS", help="Comma seperated integers for the number of spacers to consider")] + definitions: String +} + +fn main() { + let cli = Cli::parse(); + + let gene_families: HashMap = parse_families(cli.families); + + let mut gene_positions: Vec<(String, String, u64, u64)> = parse_position(cli.positions); + + gene_positions.sort_by(|a, b | crate::finder::compare_gene_position(a, b)); + + let definitions: Vec = cli.definitions.split(",").map(|x| x.parse().unwrap()).collect(); + + let tag_numbers = detect_tag_of_definitions(&definitions, &gene_positions, &gene_families); + + // Print header + print!("gene\tfamily\t"); + for (i, definition) in definitions.iter().enumerate() { + print!("tag{}", definition); + if (i + 1) < definitions.len() { + print!("\t"); + } + } + print!("\n"); + + for (gene, (family, tags)) in tag_numbers { + print!("{}\t{}\t", gene, family); + for (i, tag) in tags.iter().enumerate() { + if let Some(tag) = tag { + print!("{}", tag); + } else { + print!("-"); + } + if (i + 1) < tags.len() { + print!("\t"); + } + } + print!("\n"); + } +} diff --git a/rust/tagfinder/test/samples/families.tsv b/rust/tagfinder/test/samples/families.tsv new file mode 100644 index 0000000..8d12e15 --- /dev/null +++ b/rust/tagfinder/test/samples/families.tsv @@ -0,0 +1,4 @@ +A 1 +B 1 +C 2 +D 2 diff --git a/rust/tagfinder/test/samples/position.tsv b/rust/tagfinder/test/samples/position.tsv new file mode 100644 index 0000000..7f9e0c0 --- /dev/null +++ b/rust/tagfinder/test/samples/position.tsv @@ -0,0 +1,6 @@ +A 1 1 1 +B 1 2 1 +C 2 3 1 +U 2 4 1 +V 2 5 1 +D 2 10 1 \ No newline at end of file