27 changed files with 23 additions and 1916 deletions
--- a/notebook.org
+++ b/notebook.org
--- a/rust/pairs/Cargo.toml
+++ b/rust/pairs/Cargo.toml
@ -1,6 +0,0 @@
 [package]
 name = "pairs"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
--- a/rust/pairs/src/main.rs
+++ b/rust/pairs/src/main.rs
@ -1,31 +0,0 @@
 fn print_gene_pairs(genes: &Vec<String>) {
    for i in 0..(genes.len()-1) {
        for j in (i+1)..(genes.len()) {
            println!("{}\t{}", genes[i], genes[j]);
        }
    }
 }
 fn main() {
    let mut family_index: String = "".to_string();
    let mut family_genes: Vec<String> = Vec::new();
    for line in std::io::stdin().lines() {
        let line = line.unwrap();
        let mut parts = line.split("\t");
        let gene = parts.next().unwrap().to_string();
        let family = parts.next().unwrap().to_string();
        println!("{}", family);
        if family != family_index {
            family_index = family;
            if family_genes.len() > 1 {
                print_gene_pairs(&family_genes);
            }                    
            family_genes.clear();
        }
        family_genes.push(gene);
    }
    if family_genes.len() > 1 {
        print_gene_pairs(&family_genes);
    }
 }
--- a/rust/tagfinder/.gitignore
+++ b/rust/tagfinder/.gitignore
@ -1,14 +0,0 @@
 # Generated by Cargo
 # will have compiled files and executables
 debug/
 target/
 # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
 # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 Cargo.lock
 # These are backup files generated by rustfmt
 **/*.rs.bk
 # MSVC Windows builds of rustc generate these, which store debugging information
 *.pdb
--- a/rust/tagfinder/Cargo.lock
+++ b/rust/tagfinder/Cargo.lock
@ -1,237 +0,0 @@
 # This file is automatically @generated by Cargo.
 # It is not intended for manual editing.
 version = 3
 [[package]]
 name = "anstream"
 version = "0.6.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338"
 dependencies = [
 "anstyle",
 "anstyle-parse",
 "anstyle-query",
 "anstyle-wincon",
 "colorchoice",
 "is_terminal_polyfill",
 "utf8parse",
 ]
 [[package]]
 name = "anstyle"
 version = "1.0.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
 [[package]]
 name = "anstyle-parse"
 version = "0.2.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
 dependencies = [
 "utf8parse",
 ]
 [[package]]
 name = "anstyle-query"
 version = "1.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
 dependencies = [
 "windows-sys",
 ]
 [[package]]
 name = "anstyle-wincon"
 version = "3.0.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
 dependencies = [
 "anstyle",
 "windows-sys",
 ]
 [[package]]
 name = "clap"
 version = "4.5.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
 dependencies = [
 "clap_builder",
 "clap_derive",
 ]
 [[package]]
 name = "clap_builder"
 version = "4.5.20"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
 dependencies = [
 "anstream",
 "anstyle",
 "clap_lex",
 "strsim",
 ]
 [[package]]
 name = "clap_derive"
 version = "4.5.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
 dependencies = [
 "heck",
 "proc-macro2",
 "quote",
 "syn",
 ]
 [[package]]
 name = "clap_lex"
 version = "0.7.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
 [[package]]
 name = "colorchoice"
 version = "1.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
 [[package]]
 name = "heck"
 version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
 [[package]]
 name = "is_terminal_polyfill"
 version = "1.70.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
 [[package]]
 name = "proc-macro2"
 version = "1.0.89"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
 dependencies = [
 "unicode-ident",
 ]
 [[package]]
 name = "quote"
 version = "1.0.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
 dependencies = [
 "proc-macro2",
 ]
 [[package]]
 name = "strsim"
 version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 [[package]]
 name = "syn"
 version = "2.0.87"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
 dependencies = [
 "proc-macro2",
 "quote",
 "unicode-ident",
 ]
 [[package]]
 name = "tagfinder"
 version = "0.1.0"
 dependencies = [
 "clap",
 ]
 [[package]]
 name = "unicode-ident"
 version = "1.0.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
 [[package]]
 name = "utf8parse"
 version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
 [[package]]
 name = "windows-sys"
 version = "0.59.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
 dependencies = [
 "windows-targets",
 ]
 [[package]]
 name = "windows-targets"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
 dependencies = [
 "windows_aarch64_gnullvm",
 "windows_aarch64_msvc",
 "windows_i686_gnu",
 "windows_i686_gnullvm",
 "windows_i686_msvc",
 "windows_x86_64_gnu",
 "windows_x86_64_gnullvm",
 "windows_x86_64_msvc",
 ]
 [[package]]
 name = "windows_aarch64_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
 [[package]]
 name = "windows_aarch64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
 [[package]]
 name = "windows_i686_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
 [[package]]
 name = "windows_i686_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
 [[package]]
 name = "windows_i686_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
 [[package]]
 name = "windows_x86_64_gnu"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
 [[package]]
 name = "windows_x86_64_gnullvm"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
 [[package]]
 name = "windows_x86_64_msvc"
 version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
--- a/rust/tagfinder/Cargo.toml
+++ b/rust/tagfinder/Cargo.toml
@ -1,7 +0,0 @@
 [package]
 name = "tagfinder"
 version = "0.1.0"
 edition = "2021"
 [dependencies]
 clap = { version = "4.5.20", features = ["derive"] }
--- a/rust/tagfinder/Makefile
+++ b/rust/tagfinder/Makefile
@ -1,2 +0,0 @@
 run:
 	cargo run -- --families test/samples/families.tsv --positions test/samples/position.tsv --definitions 0,1
--- a/rust/tagfinder/src/finder.rs
+++ b/rust/tagfinder/src/finder.rs
@ -1,135 +0,0 @@
 use std::cmp::Ordering;
 use std::collections::HashMap;
 use std::fs::File;
 use std::io::{self, BufRead};
 use std::path::Path;
 fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
 where
    P: AsRef<Path>,
 {
    let file = File::open(filename)?;
    Ok(io::BufReader::new(file).lines())
 }
 pub fn parse_families<P>(filename: P) -> HashMap<String, u64>
 where
    P: AsRef<Path>,
 {
    let mut map: HashMap<String, u64> = HashMap::new();
    if let Ok(lines) = read_lines(filename) {
        for line in lines.flatten() {
            let parts: Vec<&str> = line.split("\t").collect();
            let gene = parts[0];
            let family: u64 = parts[1].parse().unwrap();
            map.insert(gene.to_string(), family);
        }
    }
    map
 }
 pub fn parse_position<P>(filename: P) -> Vec<(String, String, u64, u64)>
 where
    P: AsRef<Path>,
 {
    let mut vec: Vec<(String, String, u64, u64)> = Vec::new();
    if let Ok(lines) = read_lines(filename) {
        for line in lines.flatten() {
            let parts: Vec<&str> = line.split("\t").collect();
            let gene = parts[0];
            let chromosome = parts[1];
            let start: u64 = parts[2].parse().unwrap();
            let end: u64 = parts[3].parse().unwrap();
            vec.push((gene.to_string(), chromosome.to_string(), start, end));
        }
    }
    vec
 }
 pub fn compare_gene_position(
    position_a: &(String, String, u64, u64),
    position_b: &(String, String, u64, u64),
 ) -> Ordering {
    let (_gene_a, chromosome_a, start_a, _end_b) = position_a;
    let (_gene_b, chromosome_b, start_b, _end_b) = position_b;
    if chromosome_a < chromosome_b {
        return Ordering::Less;
    } else if chromosome_a > chromosome_b {
        return Ordering::Greater;
    } else {
        if start_a < start_b {
            return Ordering::Less;
        } else {
            return Ordering::Greater;
        }
    }
 }
 pub fn detect_tag_of_definitions(
    definitions: &Vec<u64>,
    gene_positions: &Vec<(String, String, u64, u64)>,
    gene_families: &HashMap<String, u64>,
 ) -> HashMap<String, (String, Vec<Option<u64>>)> {
    let mut tag_numbers: HashMap<String, (String, Vec<Option<u64>>)> = HashMap::new();
    for definition in definitions {
        let tag_numbers_def = detect_tag_of_definition(*definition, gene_positions, gene_families);
        for (gene, (family, tag)) in tag_numbers_def {
            if tag_numbers.contains_key(&gene) {
                let (_gene, tags) = tag_numbers.get_mut(&gene).unwrap();
                tags.push(tag);
            } else {
                tag_numbers.insert(gene, (family, vec![tag]));
            }
        }
    }
    tag_numbers
 }
 pub fn detect_tag_of_definition(
    definition: u64,
    gene_positions: &Vec<(String, String, u64, u64)>,
    gene_families: &HashMap<String, u64>,
 ) -> HashMap<String, (String, Option<u64>)> {
    let mut tag_numbers: HashMap<String, (String, Option<u64>)> = HashMap::new();
    let mut spacer_index: u64 = 0;
    let mut tag_index: u64 = 0;
    for (i, (gene_i, chromosome_i, _start_i, _end_i)) in gene_positions.iter().enumerate() {
        let family_repr: String;
        let mut is_on_tag: bool = false;
        if let Some(family_i) = gene_families.get(gene_i) {
            family_repr = format!("{}", family_i);
            let mut nb_intra_tag_spacers: u64 = 0;
            if !tag_numbers.contains_key(gene_i) {
                for (gene_j, chromosome_j, _start_j, _end_j) in &gene_positions[(i + 1)..] {
                    if *chromosome_i != *chromosome_j {
                        break;
                    }
                    if let Some(family_j) = gene_families.get(gene_j) {
                        if *family_i == *family_j {
                            nb_intra_tag_spacers = 0;
                            if !is_on_tag {
                                is_on_tag = true;
                                tag_index += 1;
                                tag_numbers.insert(gene_i.clone(), (family_repr.clone(), Some(tag_index)));
                            }
                            tag_numbers.insert(gene_j.clone(), (family_repr.clone(), Some(tag_index)));
                        }
                    } else {
                        nb_intra_tag_spacers += 1;
                    }
                    if nb_intra_tag_spacers > definition {
                        break;
                    }
                }
                if !is_on_tag {
                    tag_numbers.insert(gene_i.clone(), (family_repr, None));
                }
            }
        } else {
            spacer_index += 1;
            family_repr = format!("spacer{}", spacer_index);
            tag_numbers.insert(gene_i.clone(), (family_repr, None));
        }
    }
    tag_numbers
 }
--- a/rust/tagfinder/src/main.rs
+++ b/rust/tagfinder/src/main.rs
@ -1,57 +0,0 @@
 use std::{collections::HashMap, path::PathBuf};
 use clap::Parser;
 use finder::{detect_tag_of_definitions, parse_families, parse_position};
 mod finder;
 #[derive(Parser)]
 #[command(version="0.0.1", about="Find Tandemly Arrayed Genes", long_about="Find Tandemly Arrayed Genes in a genome based on gene position and Tandemly Arrayed Genes definition (i. e. number of spacers)")]
 struct Cli {
    #[arg(short='f', long="families", value_name="FAMILIES FILE")]
    families: PathBuf,
    #[arg(short='p', long="positions", value_name="POSITIONS FILE")]
    positions: PathBuf,
    #[arg(short='d', long="definitions", value_name="TAGs DEFINITIONS", help="Comma seperated integers for the number of spacers to consider")]
    definitions: String
 }
 fn main() {
    let cli = Cli::parse();
    let gene_families: HashMap<String, u64> = parse_families(cli.families);
    let mut gene_positions: Vec<(String, String, u64, u64)> = parse_position(cli.positions);
    gene_positions.sort_by(|a, b | crate::finder::compare_gene_position(a, b));
    let definitions: Vec<u64> = cli.definitions.split(",").map(|x| x.parse().unwrap()).collect();
    let tag_numbers = detect_tag_of_definitions(&definitions, &gene_positions, &gene_families);
    // Print header
    print!("gene\tfamily\t");
    for (i, definition) in definitions.iter().enumerate() {
        print!("tag{}", definition);
        if (i + 1) < definitions.len() {
            print!("\t");
        }
    }
    print!("\n");
    for (gene, (family, tags)) in tag_numbers {
        print!("{}\t{}\t", gene, family);
        for (i, tag) in tags.iter().enumerate() {
            if let Some(tag) = tag {
                print!("{}", tag);
            } else {
                print!("-");
            }
            if (i + 1) < tags.len() {
                print!("\t");
            }
        }
        print!("\n");
    }
 }
--- a/rust/tagfinder/test/samples/families.tsv
+++ b/rust/tagfinder/test/samples/families.tsv
@ -1,4 +0,0 @@
 A	1
 B	1
 C	2
 D	2
--- a/rust/tagfinder/test/samples/position.tsv
+++ b/rust/tagfinder/test/samples/position.tsv
@ -1,6 +0,0 @@
 A	1	1	1
 B	1	2	1
 C	2	3	1
 U	2	4	1
 V	2	5	1
 D	2	10	1
--- a/workflow/KaKs/conda/clustalw.yaml
+++ b/workflow/KaKs/conda/clustalw.yaml
@ -1,6 +0,0 @@
 name: clustalw
 channels:
  - conda-forge
  - bioconda
 dependencies:
  - bioconda::clustalw
--- a/workflow/KaKs/conda/pal2nal.yaml
+++ b/workflow/KaKs/conda/pal2nal.yaml
@ -1,6 +0,0 @@
 name: pal2nal
 channels:
  - conda-forge
  - bioconda
 dependencies:
  - bioconda::pal2nal
--- a/workflow/KaKs/conda/paml.yaml
+++ b/workflow/KaKs/conda/paml.yaml
@ -1,6 +0,0 @@
 name: paml
 channels:
  - conda-forge
  - bioconda
 dependencies:
  - bioconda::paml
--- a/workflow/KaKs/main.nf
+++ b/workflow/KaKs/main.nf
@ -1,39 +0,0 @@
 /** Compute Ka, Ks values for each
 /** pair of duplicate genes in a TAG
 /** and in a gene family
 /** */
 include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf"
 include { KA_KS } from "./modules/ka_ks.nf"
 process GENE_PAIRS {
    input:
    path families
    output:
    path 'pairs'
    script:
    """
    cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs'
    """
 }
 workflow {
    families = params.families
    cds_gz = params.cds
    proteome = params.proteome
    cds = GUNZIP_CDS(cds_gz)
    pairs = GENE_PAIRS(families)
            .splitCsv(sep: '\t', header: false)
            .map { row -> tuple(row[0], row[1]) }
    kaks = KA_KS(pairs, proteome, cds)
    // Save to a CSV with collectFile
    header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs")
    header.concat( kaks )
        .collectFile( name: 'test.txt', newLine: true, sort: false )
        .view() 
 }
--- a/workflow/KaKs/modules/gunzip.nf
+++ b/workflow/KaKs/modules/gunzip.nf
@ -1,12 +0,0 @@
 process GUNZIP {
    input:
    path zipped_file
    output:
    path zipped_file.baseName
    script:
    """
    gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
    """
 }
--- a/workflow/KaKs/modules/ka_ks.nf
+++ b/workflow/KaKs/modules/ka_ks.nf
@ -1,132 +0,0 @@
 process EXTRACT_TWO_PROTEINS {
    input:
    tuple(val(gene_id_1), val(gene_id_2))
    path fasta_file
    output:
    path "${gene_id_1}_${gene_id_2}.prot.fst"
    script:
    output_file="${gene_id_1}_${gene_id_2}.prot.fst"
    """
    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
     """
 }
 process EXTRACT_TWO_CDS {
    input:
    tuple(val(gene_id_1), val(gene_id_2))
    path fasta_file
    output:
    path "${gene_id_1}_${gene_id_2}.cds.fst"
    script:
    output_file="${gene_id_1}_${gene_id_2}.cds.fst"
    """
    bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
    """
 }
 process CLUSTALW2 {
    label 'clustalw'
    input:
    path protein_sequence
    output:
    path "${protein_sequence.simpleName}.prot.ali.aln"
    script:
    """
    clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln"
    """
 }
 process PAL2NAL {
    label 'pal2nal'
    input:
    path protein_alignment
    path coding_sequence
    output:
    path "${protein_alignment.simpleName}.cds.ali.phy"
    script:
    """
    pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy"
    """
 }
 process YN00 {
    label 'paml'
    input:
    path phylip_file
    output:
    path "${phylip_file.simpleName}.yn"
    script:
    """
    echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
    yn00
    """
 }
 process EXTRACT_KA_KS {
    input:
    tuple(val(gene_id_1), val(gene_id_2))
    path yn_file
    output:
    path 'csv_row'
    script:
    """
    KaKs=\$(awk '
    BEGIN {
          OFS=","
          on_good_section=0
          skip=0
    }
    \$1 == "(B)" {
         skip=8
         on_good_section=1
    }
    on_good_section == 1 {
        if (skip == 0) {
            Ka=\$8
            Ks=\$11
            print Ka, Ks
            exit
        } else {
            skip -= 1
        }
    }
    ' "${yn_file}")
    arr=(\${KaKs//,/ })
    Ka=\${arr[0]}
    Ks=\${arr[1]}
    echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
    """
 }
 workflow KA_KS {
    take:
    gene_id_pair
    proteome_fasta
    cds_fasta
    main:
    protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
    cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
    protein_alignment = CLUSTALW2(protein_sequences)
    phylip = PAL2NAL(protein_alignment, cds_sequences)
    yn = YN00(phylip)
    kaks = EXTRACT_KA_KS(gene_id_pair, yn)
    emit:
    kaks = kaks
 }
--- a/workflow/KaKs/scripts/extract_fasta_records.sh
+++ b/workflow/KaKs/scripts/extract_fasta_records.sh
@ -1,26 +0,0 @@
 #!/usr/bin/env sh
 gene_id_1="${1}"
 gene_id_2="${2}"
 fasta_file="${3}"
 output_file="${4}"
 echo "" > "${output_file}"
 for gene_id in ${gene_id_1} ${gene_id_2}; do
    awk -v gene_id="${gene_id}" '
        BEGIN {
            on_gene=0
        }
        /^[^>]/ && on_gene == 1 {
            print $0
        }
        /^>/ {
            gene = $4
            gsub("gene:", "", gene)
            if (gene == gene_id) {
                on_gene=1
                print $0
            } else {
                on_gene=0
            }
        }
        ' "${fasta_file}" >> "${output_file}"
 done
--- a/workflow/KaKs/ugly.sh
+++ b/workflow/KaKs/ugly.sh
@ -1,76 +0,0 @@
 #!/usr/bin/env bash
 # Bash version of the Nextflow script that should run faster
 output_file="Glycine_max_duplicate_genes_KaKs.tsv"
 proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
 cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
 set -euo pipefail
 mkdir -p tmp
 echo "seqfile = ./tmp/cds.ali.phy
 outfile = ./tmp/yn
 verbose = 0
 icode = 0
 weighting = 0
 commonf3x4 = 0" > yn00.ctl
 compute_kaks() {
    local gene_id_1
    local gene_id_2
    gene_id_1=$1
    gene_id_2=$2
    # Extract the proteins
    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
    # Extract the CDS
    ./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
    # Run clustalw2
    clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
    # Run Pal2Nal
    pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
    # Run yn00
    yn00
    # Extract Ka/Ks
    awk '
    BEGIN {
          on_good_section=0
          skip=0
    }
    $1 == "(B)" {
         skip=8
         on_good_section=1
    }
    on_good_section == 1 {
        if (skip == 0) {
            Ka=$8
            Ks=$11
            print Ka, Ks
            exit
        } else {
            skip -= 1
        }
    }
    ' "./tmp/yn"
 }
 echo "gene_a,gene_b,ka,ks" > "${output_file}"
 input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
 n=$(wc -l "${input}")
 i=0
 while IFS= read -r line
 do
    echo $i / $n
    gene_a=$(echo $line | awk '{print $1}')
    gene_b=$(echo $line | awk '{print $2}')
    i=$(expr $i + 1)
    kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
    if [[ ! -z $kaks ]]
       then
       arr=(${kaks//\t/ })
       echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
    fi
 done < "${input}"
--- a/workflow/main.nf
+++ b/workflow/main.nf
@ -8,26 +8,18 @@
 nextflow.enable.dsl = 2;
 include { GUNZIP } from "./modules/gunzip.nf"
-include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
+include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
 include { BLAST_BLASTP } from "./modules/blast.nf"
 include { FILTER_FASTA } from "./modules/filter_fasta.nf"
 include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
 include { CLUSTERING } from "./modules/clustering.nf"
 process PROTEIN_GENE_MAPPING {
    input:
    path proteome
    output:
    path 'protein_gene.tsv'
 }
 workflow {
    proteome = Channel.fromPath(params.proteome)
    GUNZIP(proteome)
    FILTER_FASTA(GUNZIP.out)
-    BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
+    BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
-    FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
+    BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
-
+    FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
    CLUSTERING(FILTER_BLASTP.out)
 }
--- a/workflow/modules/blast.nf
+++ b/workflow/modules/blast.nf
@ -34,19 +34,6 @@ process BLAST_BLASTP {
    script:
        """
-    blastp -query "${proteome}" -db 'db' -outfmt '6' -num_threads 7  -out "${species}.all-against-all.blastp.tsv"
+    blastp -query "${proteome}" -db 'db' -outfmt '6'  -out "${species}.all-against-all.blastp.tsv"
        """
 }
 workflow BLAST_ALL_AGAINST_ALL {
    take:
    proteome
    main:
    BLAST_MAKEBLASTDB(params.species, proteome)
    BLAST_BLASTP(params.species, proteome, BLAST_MAKEBLASTDB.out)
    emit:
    BLAST_BLASTP.out
 }
--- a/workflow/modules/clustering.nf
+++ b/workflow/modules/clustering.nf
@ -46,7 +46,7 @@ workflow CLUSTERING {
    main:
    BLASTP_TO_ABC(blastp_tsv)
-    MCL(BLASTP_TO_ABC.out)
+    MCL(BLASTP_TO_ABC).out
    MCL_TO_TSV(MCL.out)
    emit:
--- a/workflow/modules/filter_blastp.nf
+++ b/workflow/modules/filter_blastp.nf
@ -1,7 +1,6 @@
 /** Filter blastp output based on coverage and identity percentage
 /**/
 process FILTER_BLASTP {
    input:
@ -9,21 +8,18 @@ process FILTER_BLASTP {
    val min_identity
    path blastp
    path protein_length
    path protein_gene
    output:
    path 'filtered_blastp.tsv'
    script:
    """
-    sort -k 1 "${blastp}" > blastp_s
+    sort "${blastp}" > blastp_s
-    sort -k 1 "${protein_length}" > protein_length_s
+    sort "${protein_length}" > protein_length_s
-    join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv
+    join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
-    sort -k 2 join1.tsv > join1.tsv_s
+    sort -k 2 "join1.tsv" > join1.tsv_s
-    join -1 2 -2 1 -t'\t' join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv'
+    join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
-    awk -f "${baseDir}/scripts/filter_blastp.awk" \
+    awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
        -v coverage="${min_coverage}" \
        -v identity="${min_identity}" \
        "${blastp}" > 'filtered_blastp.tsv'
    """
 }
--- a/workflow/scripts/filter_blastp_sequence_id.awk
+++ b/workflow/scripts/filter_blastp_sequence_id.awk
@ -1,21 +0,0 @@
 #!/usr/bin/env -S awk -f
 # Filter BLASTp format 6 file
 # to keep only the records
 # with ID in records.list
 #
 # Usage:
 # awk -f filter_blastp_sequence_id.awk \
 # records.list records.blastp.tsv
 NR == FNR {
    sequence_id = $1
    to_remove[sequence_id] = 1
    next
 }
 {
    sequence_id = $1
    if (!(sequence_id in to_remove)) {
        print $0
    }
 }
--- a/workflow/scripts/keep_heaviest_edge_abc.awk
+++ b/workflow/scripts/keep_heaviest_edge_abc.awk
@ -1,30 +0,0 @@
 #!/usr/bin/env -S awk -f
 # Usage: $0 file.abc > filtered_file.abc
 BEGIN {
    OFS = FS = "\t"
 }
 {
    a=$1
    b=$2
    c=$3
    if (a > b) {
        tmp=a
        a=b
        b=tmp
    }
    if (!(b in graph[a])) {
        graph[a][b] = c
    } else {
        if (graph[a][b] < c) {
            graph[a][b] = c
        }
    }
 }
 END {
    for (a in graph) {
        for (b in graph[a]) {
            print a, b, graph[a][b]
        }
    }
 }
--- a/workflow/scripts/mcl_to_tsv.awk
+++ b/workflow/scripts/mcl_to_tsv.awk
@ -5,12 +5,12 @@
 BEGIN {
    family_identifier=0
    OFS="\t"
    FS=" "
 }
 {
    family_identifier++
-    for (i=1; i <= NF; i++) {
+    split($0, gene_list, " ")
-        print $i, family_identifier
+    for (gene in gene_list) {
        print gene, family_identifier
    }
 }
--- a/workflow/scripts/protein_lengths.awk
+++ b/workflow/scripts/protein_lengths.awk
@ -1,25 +1,19 @@
 #!/usr/bin/env -S awk -f
-# Associate a isoform id
+# Associate a isoform id to the length of the sequence
 # to the length of the sequence
 # and the associated gene id
 BEGIN {
    sequence_length=0
    isoform_id=""
    gene_id=""
    OFS="\t"
 }
 /^>/ {
    if (isoform_id != "") {
-        print isoform_id, sequence_length, gene_id
+        print isoform_id, sequence_length
    } else {
        isoform_id = $1
        gsub(">", "", isoform_id)
        sequence_length = 0
    }
    isoform_id = $1
    gene_id = $4
    gsub(">", "", isoform_id)
    gsub("gene:", "", gene_id)
    sequence_length = 0
 }
 /^[^>]/ {
@ -28,6 +22,6 @@ BEGIN {
 END {
    if (isoform_id != "") {
-        print isoform_id, sequence_length, gene_id
+        print isoform_id, sequence_length
    }
 }
		`@ -1,2 +0,0 @@`
			`run:`
			`cargo run -- --families test/samples/families.tsv --positions test/samples/position.tsv --definitions 0,1`