Rust tagfinder
This commit is contained in:
parent
53f6a60943
commit
13f872cf10
|
@ -0,0 +1,237 @@
|
||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 3
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstream"
|
||||||
|
version = "0.6.17"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"anstyle-parse",
|
||||||
|
"anstyle-query",
|
||||||
|
"anstyle-wincon",
|
||||||
|
"colorchoice",
|
||||||
|
"is_terminal_polyfill",
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle"
|
||||||
|
version = "1.0.10"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-parse"
|
||||||
|
version = "0.2.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
|
||||||
|
dependencies = [
|
||||||
|
"utf8parse",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-query"
|
||||||
|
version = "1.1.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
|
||||||
|
dependencies = [
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anstyle-wincon"
|
||||||
|
version = "3.0.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
|
||||||
|
dependencies = [
|
||||||
|
"anstyle",
|
||||||
|
"windows-sys",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap"
|
||||||
|
version = "4.5.20"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
|
||||||
|
dependencies = [
|
||||||
|
"clap_builder",
|
||||||
|
"clap_derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_builder"
|
||||||
|
version = "4.5.20"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
|
||||||
|
dependencies = [
|
||||||
|
"anstream",
|
||||||
|
"anstyle",
|
||||||
|
"clap_lex",
|
||||||
|
"strsim",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_derive"
|
||||||
|
version = "4.5.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_lex"
|
||||||
|
version = "0.7.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "colorchoice"
|
||||||
|
version = "1.0.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "heck"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "is_terminal_polyfill"
|
||||||
|
version = "1.70.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.89"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.37"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.11.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "2.0.87"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-ident",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tagfinder"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"clap",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-ident"
|
||||||
|
version = "1.0.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8parse"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-sys"
|
||||||
|
version = "0.59.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
||||||
|
dependencies = [
|
||||||
|
"windows-targets",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows-targets"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||||
|
dependencies = [
|
||||||
|
"windows_aarch64_gnullvm",
|
||||||
|
"windows_aarch64_msvc",
|
||||||
|
"windows_i686_gnu",
|
||||||
|
"windows_i686_gnullvm",
|
||||||
|
"windows_i686_msvc",
|
||||||
|
"windows_x86_64_gnu",
|
||||||
|
"windows_x86_64_gnullvm",
|
||||||
|
"windows_x86_64_msvc",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_aarch64_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnu"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_i686_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnu"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_gnullvm"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "windows_x86_64_msvc"
|
||||||
|
version = "0.52.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
|
@ -0,0 +1,7 @@
|
||||||
|
[package]
|
||||||
|
name = "tagfinder"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
clap = { version = "4.5.20", features = ["derive"] }
|
|
@ -0,0 +1,2 @@
|
||||||
|
run:
|
||||||
|
cargo run -- --families test/samples/families.tsv --positions test/samples/position.tsv --definitions 0,1
|
|
@ -0,0 +1,135 @@
|
||||||
|
use std::cmp::Ordering;
|
||||||
|
use std::collections::HashMap;
|
||||||
|
use std::fs::File;
|
||||||
|
use std::io::{self, BufRead};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
|
||||||
|
where
|
||||||
|
P: AsRef<Path>,
|
||||||
|
{
|
||||||
|
let file = File::open(filename)?;
|
||||||
|
Ok(io::BufReader::new(file).lines())
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_families<P>(filename: P) -> HashMap<String, u64>
|
||||||
|
where
|
||||||
|
P: AsRef<Path>,
|
||||||
|
{
|
||||||
|
let mut map: HashMap<String, u64> = HashMap::new();
|
||||||
|
if let Ok(lines) = read_lines(filename) {
|
||||||
|
for line in lines.flatten() {
|
||||||
|
let parts: Vec<&str> = line.split("\t").collect();
|
||||||
|
let gene = parts[0];
|
||||||
|
let family: u64 = parts[1].parse().unwrap();
|
||||||
|
map.insert(gene.to_string(), family);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
map
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_position<P>(filename: P) -> Vec<(String, String, u64, u64)>
|
||||||
|
where
|
||||||
|
P: AsRef<Path>,
|
||||||
|
{
|
||||||
|
let mut vec: Vec<(String, String, u64, u64)> = Vec::new();
|
||||||
|
if let Ok(lines) = read_lines(filename) {
|
||||||
|
for line in lines.flatten() {
|
||||||
|
let parts: Vec<&str> = line.split("\t").collect();
|
||||||
|
let gene = parts[0];
|
||||||
|
let chromosome = parts[1];
|
||||||
|
let start: u64 = parts[2].parse().unwrap();
|
||||||
|
let end: u64 = parts[3].parse().unwrap();
|
||||||
|
vec.push((gene.to_string(), chromosome.to_string(), start, end));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
vec
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn compare_gene_position(
|
||||||
|
position_a: &(String, String, u64, u64),
|
||||||
|
position_b: &(String, String, u64, u64),
|
||||||
|
) -> Ordering {
|
||||||
|
let (_gene_a, chromosome_a, start_a, _end_b) = position_a;
|
||||||
|
let (_gene_b, chromosome_b, start_b, _end_b) = position_b;
|
||||||
|
if chromosome_a < chromosome_b {
|
||||||
|
return Ordering::Less;
|
||||||
|
} else if chromosome_a > chromosome_b {
|
||||||
|
return Ordering::Greater;
|
||||||
|
} else {
|
||||||
|
if start_a < start_b {
|
||||||
|
return Ordering::Less;
|
||||||
|
} else {
|
||||||
|
return Ordering::Greater;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn detect_tag_of_definitions(
|
||||||
|
definitions: &Vec<u64>,
|
||||||
|
gene_positions: &Vec<(String, String, u64, u64)>,
|
||||||
|
gene_families: &HashMap<String, u64>,
|
||||||
|
) -> HashMap<String, (String, Vec<Option<u64>>)> {
|
||||||
|
let mut tag_numbers: HashMap<String, (String, Vec<Option<u64>>)> = HashMap::new();
|
||||||
|
for definition in definitions {
|
||||||
|
let tag_numbers_def = detect_tag_of_definition(*definition, gene_positions, gene_families);
|
||||||
|
for (gene, (family, tag)) in tag_numbers_def {
|
||||||
|
if tag_numbers.contains_key(&gene) {
|
||||||
|
let (_gene, tags) = tag_numbers.get_mut(&gene).unwrap();
|
||||||
|
tags.push(tag);
|
||||||
|
} else {
|
||||||
|
tag_numbers.insert(gene, (family, vec![tag]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tag_numbers
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn detect_tag_of_definition(
|
||||||
|
definition: u64,
|
||||||
|
gene_positions: &Vec<(String, String, u64, u64)>,
|
||||||
|
gene_families: &HashMap<String, u64>,
|
||||||
|
) -> HashMap<String, (String, Option<u64>)> {
|
||||||
|
let mut tag_numbers: HashMap<String, (String, Option<u64>)> = HashMap::new();
|
||||||
|
let mut spacer_index: u64 = 0;
|
||||||
|
let mut tag_index: u64 = 0;
|
||||||
|
for (i, (gene_i, chromosome_i, _start_i, _end_i)) in gene_positions.iter().enumerate() {
|
||||||
|
let family_repr: String;
|
||||||
|
let mut is_on_tag: bool = false;
|
||||||
|
if let Some(family_i) = gene_families.get(gene_i) {
|
||||||
|
family_repr = format!("{}", family_i);
|
||||||
|
let mut nb_intra_tag_spacers: u64 = 0;
|
||||||
|
if !tag_numbers.contains_key(gene_i) {
|
||||||
|
for (gene_j, chromosome_j, _start_j, _end_j) in &gene_positions[(i + 1)..] {
|
||||||
|
if *chromosome_i != *chromosome_j {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if let Some(family_j) = gene_families.get(gene_j) {
|
||||||
|
if *family_i == *family_j {
|
||||||
|
nb_intra_tag_spacers = 0;
|
||||||
|
if !is_on_tag {
|
||||||
|
is_on_tag = true;
|
||||||
|
tag_index += 1;
|
||||||
|
tag_numbers.insert(gene_i.clone(), (family_repr.clone(), Some(tag_index)));
|
||||||
|
}
|
||||||
|
tag_numbers.insert(gene_j.clone(), (family_repr.clone(), Some(tag_index)));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
nb_intra_tag_spacers += 1;
|
||||||
|
}
|
||||||
|
if nb_intra_tag_spacers > definition {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !is_on_tag {
|
||||||
|
tag_numbers.insert(gene_i.clone(), (family_repr, None));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
spacer_index += 1;
|
||||||
|
family_repr = format!("spacer{}", spacer_index);
|
||||||
|
tag_numbers.insert(gene_i.clone(), (family_repr, None));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tag_numbers
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
use std::{collections::HashMap, path::PathBuf};
|
||||||
|
use clap::Parser;
|
||||||
|
use finder::{detect_tag_of_definitions, parse_families, parse_position};
|
||||||
|
|
||||||
|
mod finder;
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
#[command(version="0.0.1", about="Find Tandemly Arrayed Genes", long_about="Find Tandemly Arrayed Genes in a genome based on gene position and Tandemly Arrayed Genes definition (i. e. number of spacers)")]
|
||||||
|
struct Cli {
|
||||||
|
#[arg(short='f', long="families", value_name="FAMILIES FILE")]
|
||||||
|
families: PathBuf,
|
||||||
|
|
||||||
|
#[arg(short='p', long="positions", value_name="POSITIONS FILE")]
|
||||||
|
positions: PathBuf,
|
||||||
|
|
||||||
|
#[arg(short='d', long="definitions", value_name="TAGs DEFINITIONS", help="Comma seperated integers for the number of spacers to consider")]
|
||||||
|
definitions: String
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let cli = Cli::parse();
|
||||||
|
|
||||||
|
let gene_families: HashMap<String, u64> = parse_families(cli.families);
|
||||||
|
|
||||||
|
let mut gene_positions: Vec<(String, String, u64, u64)> = parse_position(cli.positions);
|
||||||
|
|
||||||
|
gene_positions.sort_by(|a, b | crate::finder::compare_gene_position(a, b));
|
||||||
|
|
||||||
|
let definitions: Vec<u64> = cli.definitions.split(",").map(|x| x.parse().unwrap()).collect();
|
||||||
|
|
||||||
|
let tag_numbers = detect_tag_of_definitions(&definitions, &gene_positions, &gene_families);
|
||||||
|
|
||||||
|
// Print header
|
||||||
|
print!("gene\tfamily\t");
|
||||||
|
for (i, definition) in definitions.iter().enumerate() {
|
||||||
|
print!("tag{}", definition);
|
||||||
|
if (i + 1) < definitions.len() {
|
||||||
|
print!("\t");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print!("\n");
|
||||||
|
|
||||||
|
for (gene, (family, tags)) in tag_numbers {
|
||||||
|
print!("{}\t{}\t", gene, family);
|
||||||
|
for (i, tag) in tags.iter().enumerate() {
|
||||||
|
if let Some(tag) = tag {
|
||||||
|
print!("{}", tag);
|
||||||
|
} else {
|
||||||
|
print!("-");
|
||||||
|
}
|
||||||
|
if (i + 1) < tags.len() {
|
||||||
|
print!("\t");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
print!("\n");
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,4 @@
|
||||||
|
A 1
|
||||||
|
B 1
|
||||||
|
C 2
|
||||||
|
D 2
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
A 1 1 1
|
||||||
|
B 1 2 1
|
||||||
|
C 2 3 1
|
||||||
|
U 2 4 1
|
||||||
|
V 2 5 1
|
||||||
|
D 2 10 1
|
|
Loading…
Reference in New Issue