Compare commits

..

No commits in common. "3e3c184153cf384dbc87b05308fa1a8a23347e5b" and "0540e3699eb87b159c0a1072da538f5101a861cd" have entirely different histories.

27 changed files with 23 additions and 1916 deletions

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +0,0 @@
[package]
name = "pairs"
version = "0.1.0"
edition = "2021"
[dependencies]

View File

@ -1,31 +0,0 @@
fn print_gene_pairs(genes: &Vec<String>) {
for i in 0..(genes.len()-1) {
for j in (i+1)..(genes.len()) {
println!("{}\t{}", genes[i], genes[j]);
}
}
}
fn main() {
let mut family_index: String = "".to_string();
let mut family_genes: Vec<String> = Vec::new();
for line in std::io::stdin().lines() {
let line = line.unwrap();
let mut parts = line.split("\t");
let gene = parts.next().unwrap().to_string();
let family = parts.next().unwrap().to_string();
println!("{}", family);
if family != family_index {
family_index = family;
if family_genes.len() > 1 {
print_gene_pairs(&family_genes);
}
family_genes.clear();
}
family_genes.push(gene);
}
if family_genes.len() > 1 {
print_gene_pairs(&family_genes);
}
}

View File

@ -1,14 +0,0 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

View File

@ -1,237 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "anstream"
version = "0.6.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys",
]
[[package]]
name = "clap"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "proc-macro2"
version = "1.0.89"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "2.0.87"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tagfinder"
version = "0.1.0"
dependencies = [
"clap",
]
[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View File

@ -1,7 +0,0 @@
[package]
name = "tagfinder"
version = "0.1.0"
edition = "2021"
[dependencies]
clap = { version = "4.5.20", features = ["derive"] }

View File

@ -1,2 +0,0 @@
run:
cargo run -- --families test/samples/families.tsv --positions test/samples/position.tsv --definitions 0,1

View File

@ -1,135 +0,0 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead};
use std::path::Path;
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
P: AsRef<Path>,
{
let file = File::open(filename)?;
Ok(io::BufReader::new(file).lines())
}
pub fn parse_families<P>(filename: P) -> HashMap<String, u64>
where
P: AsRef<Path>,
{
let mut map: HashMap<String, u64> = HashMap::new();
if let Ok(lines) = read_lines(filename) {
for line in lines.flatten() {
let parts: Vec<&str> = line.split("\t").collect();
let gene = parts[0];
let family: u64 = parts[1].parse().unwrap();
map.insert(gene.to_string(), family);
}
}
map
}
pub fn parse_position<P>(filename: P) -> Vec<(String, String, u64, u64)>
where
P: AsRef<Path>,
{
let mut vec: Vec<(String, String, u64, u64)> = Vec::new();
if let Ok(lines) = read_lines(filename) {
for line in lines.flatten() {
let parts: Vec<&str> = line.split("\t").collect();
let gene = parts[0];
let chromosome = parts[1];
let start: u64 = parts[2].parse().unwrap();
let end: u64 = parts[3].parse().unwrap();
vec.push((gene.to_string(), chromosome.to_string(), start, end));
}
}
vec
}
pub fn compare_gene_position(
position_a: &(String, String, u64, u64),
position_b: &(String, String, u64, u64),
) -> Ordering {
let (_gene_a, chromosome_a, start_a, _end_b) = position_a;
let (_gene_b, chromosome_b, start_b, _end_b) = position_b;
if chromosome_a < chromosome_b {
return Ordering::Less;
} else if chromosome_a > chromosome_b {
return Ordering::Greater;
} else {
if start_a < start_b {
return Ordering::Less;
} else {
return Ordering::Greater;
}
}
}
pub fn detect_tag_of_definitions(
definitions: &Vec<u64>,
gene_positions: &Vec<(String, String, u64, u64)>,
gene_families: &HashMap<String, u64>,
) -> HashMap<String, (String, Vec<Option<u64>>)> {
let mut tag_numbers: HashMap<String, (String, Vec<Option<u64>>)> = HashMap::new();
for definition in definitions {
let tag_numbers_def = detect_tag_of_definition(*definition, gene_positions, gene_families);
for (gene, (family, tag)) in tag_numbers_def {
if tag_numbers.contains_key(&gene) {
let (_gene, tags) = tag_numbers.get_mut(&gene).unwrap();
tags.push(tag);
} else {
tag_numbers.insert(gene, (family, vec![tag]));
}
}
}
tag_numbers
}
pub fn detect_tag_of_definition(
definition: u64,
gene_positions: &Vec<(String, String, u64, u64)>,
gene_families: &HashMap<String, u64>,
) -> HashMap<String, (String, Option<u64>)> {
let mut tag_numbers: HashMap<String, (String, Option<u64>)> = HashMap::new();
let mut spacer_index: u64 = 0;
let mut tag_index: u64 = 0;
for (i, (gene_i, chromosome_i, _start_i, _end_i)) in gene_positions.iter().enumerate() {
let family_repr: String;
let mut is_on_tag: bool = false;
if let Some(family_i) = gene_families.get(gene_i) {
family_repr = format!("{}", family_i);
let mut nb_intra_tag_spacers: u64 = 0;
if !tag_numbers.contains_key(gene_i) {
for (gene_j, chromosome_j, _start_j, _end_j) in &gene_positions[(i + 1)..] {
if *chromosome_i != *chromosome_j {
break;
}
if let Some(family_j) = gene_families.get(gene_j) {
if *family_i == *family_j {
nb_intra_tag_spacers = 0;
if !is_on_tag {
is_on_tag = true;
tag_index += 1;
tag_numbers.insert(gene_i.clone(), (family_repr.clone(), Some(tag_index)));
}
tag_numbers.insert(gene_j.clone(), (family_repr.clone(), Some(tag_index)));
}
} else {
nb_intra_tag_spacers += 1;
}
if nb_intra_tag_spacers > definition {
break;
}
}
if !is_on_tag {
tag_numbers.insert(gene_i.clone(), (family_repr, None));
}
}
} else {
spacer_index += 1;
family_repr = format!("spacer{}", spacer_index);
tag_numbers.insert(gene_i.clone(), (family_repr, None));
}
}
tag_numbers
}

View File

@ -1,57 +0,0 @@
use std::{collections::HashMap, path::PathBuf};
use clap::Parser;
use finder::{detect_tag_of_definitions, parse_families, parse_position};
mod finder;
#[derive(Parser)]
#[command(version="0.0.1", about="Find Tandemly Arrayed Genes", long_about="Find Tandemly Arrayed Genes in a genome based on gene position and Tandemly Arrayed Genes definition (i. e. number of spacers)")]
struct Cli {
#[arg(short='f', long="families", value_name="FAMILIES FILE")]
families: PathBuf,
#[arg(short='p', long="positions", value_name="POSITIONS FILE")]
positions: PathBuf,
#[arg(short='d', long="definitions", value_name="TAGs DEFINITIONS", help="Comma seperated integers for the number of spacers to consider")]
definitions: String
}
fn main() {
let cli = Cli::parse();
let gene_families: HashMap<String, u64> = parse_families(cli.families);
let mut gene_positions: Vec<(String, String, u64, u64)> = parse_position(cli.positions);
gene_positions.sort_by(|a, b | crate::finder::compare_gene_position(a, b));
let definitions: Vec<u64> = cli.definitions.split(",").map(|x| x.parse().unwrap()).collect();
let tag_numbers = detect_tag_of_definitions(&definitions, &gene_positions, &gene_families);
// Print header
print!("gene\tfamily\t");
for (i, definition) in definitions.iter().enumerate() {
print!("tag{}", definition);
if (i + 1) < definitions.len() {
print!("\t");
}
}
print!("\n");
for (gene, (family, tags)) in tag_numbers {
print!("{}\t{}\t", gene, family);
for (i, tag) in tags.iter().enumerate() {
if let Some(tag) = tag {
print!("{}", tag);
} else {
print!("-");
}
if (i + 1) < tags.len() {
print!("\t");
}
}
print!("\n");
}
}

View File

@ -1,4 +0,0 @@
A 1
B 1
C 2
D 2
1 A 1
2 B 1
3 C 2
4 D 2

View File

@ -1,6 +0,0 @@
A 1 1 1
B 1 2 1
C 2 3 1
U 2 4 1
V 2 5 1
D 2 10 1
1 A 1 1 1
2 B 1 2 1
3 C 2 3 1
4 U 2 4 1
5 V 2 5 1
6 D 2 10 1

View File

@ -1,6 +0,0 @@
name: clustalw
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::clustalw

View File

@ -1,6 +0,0 @@
name: pal2nal
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::pal2nal

View File

@ -1,6 +0,0 @@
name: paml
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::paml

View File

@ -1,39 +0,0 @@
/** Compute Ka, Ks values for each
/** pair of duplicate genes in a TAG
/** and in a gene family
/** */
include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf"
include { KA_KS } from "./modules/ka_ks.nf"
process GENE_PAIRS {
input:
path families
output:
path 'pairs'
script:
"""
cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs'
"""
}
workflow {
families = params.families
cds_gz = params.cds
proteome = params.proteome
cds = GUNZIP_CDS(cds_gz)
pairs = GENE_PAIRS(families)
.splitCsv(sep: '\t', header: false)
.map { row -> tuple(row[0], row[1]) }
kaks = KA_KS(pairs, proteome, cds)
// Save to a CSV with collectFile
header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs")
header.concat( kaks )
.collectFile( name: 'test.txt', newLine: true, sort: false )
.view()
}

View File

@ -1,12 +0,0 @@
process GUNZIP {
input:
path zipped_file
output:
path zipped_file.baseName
script:
"""
gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
"""
}

View File

@ -1,132 +0,0 @@
process EXTRACT_TWO_PROTEINS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path fasta_file
output:
path "${gene_id_1}_${gene_id_2}.prot.fst"
script:
output_file="${gene_id_1}_${gene_id_2}.prot.fst"
"""
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
"""
}
process EXTRACT_TWO_CDS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path fasta_file
output:
path "${gene_id_1}_${gene_id_2}.cds.fst"
script:
output_file="${gene_id_1}_${gene_id_2}.cds.fst"
"""
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
"""
}
process CLUSTALW2 {
label 'clustalw'
input:
path protein_sequence
output:
path "${protein_sequence.simpleName}.prot.ali.aln"
script:
"""
clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln"
"""
}
process PAL2NAL {
label 'pal2nal'
input:
path protein_alignment
path coding_sequence
output:
path "${protein_alignment.simpleName}.cds.ali.phy"
script:
"""
pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy"
"""
}
process YN00 {
label 'paml'
input:
path phylip_file
output:
path "${phylip_file.simpleName}.yn"
script:
"""
echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
yn00
"""
}
process EXTRACT_KA_KS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path yn_file
output:
path 'csv_row'
script:
"""
KaKs=\$(awk '
BEGIN {
OFS=","
on_good_section=0
skip=0
}
\$1 == "(B)" {
skip=8
on_good_section=1
}
on_good_section == 1 {
if (skip == 0) {
Ka=\$8
Ks=\$11
print Ka, Ks
exit
} else {
skip -= 1
}
}
' "${yn_file}")
arr=(\${KaKs//,/ })
Ka=\${arr[0]}
Ks=\${arr[1]}
echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
"""
}
workflow KA_KS {
take:
gene_id_pair
proteome_fasta
cds_fasta
main:
protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
protein_alignment = CLUSTALW2(protein_sequences)
phylip = PAL2NAL(protein_alignment, cds_sequences)
yn = YN00(phylip)
kaks = EXTRACT_KA_KS(gene_id_pair, yn)
emit:
kaks = kaks
}

View File

@ -1,26 +0,0 @@
#!/usr/bin/env sh
gene_id_1="${1}"
gene_id_2="${2}"
fasta_file="${3}"
output_file="${4}"
echo "" > "${output_file}"
for gene_id in ${gene_id_1} ${gene_id_2}; do
awk -v gene_id="${gene_id}" '
BEGIN {
on_gene=0
}
/^[^>]/ && on_gene == 1 {
print $0
}
/^>/ {
gene = $4
gsub("gene:", "", gene)
if (gene == gene_id) {
on_gene=1
print $0
} else {
on_gene=0
}
}
' "${fasta_file}" >> "${output_file}"
done

View File

@ -1,76 +0,0 @@
#!/usr/bin/env bash
# Bash version of the Nextflow script that should run faster
output_file="Glycine_max_duplicate_genes_KaKs.tsv"
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
set -euo pipefail
mkdir -p tmp
echo "seqfile = ./tmp/cds.ali.phy
outfile = ./tmp/yn
verbose = 0
icode = 0
weighting = 0
commonf3x4 = 0" > yn00.ctl
compute_kaks() {
local gene_id_1
local gene_id_2
gene_id_1=$1
gene_id_2=$2
# Extract the proteins
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
# Extract the CDS
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
# Run clustalw2
clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
# Run Pal2Nal
pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
# Run yn00
yn00
# Extract Ka/Ks
awk '
BEGIN {
on_good_section=0
skip=0
}
$1 == "(B)" {
skip=8
on_good_section=1
}
on_good_section == 1 {
if (skip == 0) {
Ka=$8
Ks=$11
print Ka, Ks
exit
} else {
skip -= 1
}
}
' "./tmp/yn"
}
echo "gene_a,gene_b,ka,ks" > "${output_file}"
input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
n=$(wc -l "${input}")
i=0
while IFS= read -r line
do
echo $i / $n
gene_a=$(echo $line | awk '{print $1}')
gene_b=$(echo $line | awk '{print $2}')
i=$(expr $i + 1)
kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
if [[ ! -z $kaks ]]
then
arr=(${kaks//\t/ })
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
fi
done < "${input}"

View File

@ -8,26 +8,18 @@
nextflow.enable.dsl = 2; nextflow.enable.dsl = 2;
include { GUNZIP } from "./modules/gunzip.nf" include { GUNZIP } from "./modules/gunzip.nf"
include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf" include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
include { BLAST_BLASTP } from "./modules/blast.nf"
include { FILTER_FASTA } from "./modules/filter_fasta.nf" include { FILTER_FASTA } from "./modules/filter_fasta.nf"
include { FILTER_BLASTP } from "./modules/filter_blastp.nf" include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
include { CLUSTERING } from "./modules/clustering.nf" include { CLUSTERING } from "./modules/clustering.nf"
process PROTEIN_GENE_MAPPING {
input:
path proteome
output:
path 'protein_gene.tsv'
}
workflow { workflow {
proteome = Channel.fromPath(params.proteome) proteome = Channel.fromPath(params.proteome)
GUNZIP(proteome) GUNZIP(proteome)
FILTER_FASTA(GUNZIP.out) FILTER_FASTA(GUNZIP.out)
BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome) BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths) BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
CLUSTERING(FILTER_BLASTP.out) CLUSTERING(FILTER_BLASTP.out)
} }

View File

@ -34,19 +34,6 @@ process BLAST_BLASTP {
script: script:
""" """
blastp -query "${proteome}" -db 'db' -outfmt '6' -num_threads 7 -out "${species}.all-against-all.blastp.tsv" blastp -query "${proteome}" -db 'db' -outfmt '6' -out "${species}.all-against-all.blastp.tsv"
""" """
} }
workflow BLAST_ALL_AGAINST_ALL {
take:
proteome
main:
BLAST_MAKEBLASTDB(params.species, proteome)
BLAST_BLASTP(params.species, proteome, BLAST_MAKEBLASTDB.out)
emit:
BLAST_BLASTP.out
}

View File

@ -46,7 +46,7 @@ workflow CLUSTERING {
main: main:
BLASTP_TO_ABC(blastp_tsv) BLASTP_TO_ABC(blastp_tsv)
MCL(BLASTP_TO_ABC.out) MCL(BLASTP_TO_ABC).out
MCL_TO_TSV(MCL.out) MCL_TO_TSV(MCL.out)
emit: emit:

View File

@ -1,7 +1,6 @@
/** Filter blastp output based on coverage and identity percentage /** Filter blastp output based on coverage and identity percentage
/**/ /**/
process FILTER_BLASTP { process FILTER_BLASTP {
input: input:
@ -9,21 +8,18 @@ process FILTER_BLASTP {
val min_identity val min_identity
path blastp path blastp
path protein_length path protein_length
path protein_gene
output: output:
path 'filtered_blastp.tsv' path 'filtered_blastp.tsv'
script: script:
""" """
sort -k 1 "${blastp}" > blastp_s sort "${blastp}" > blastp_s
sort -k 1 "${protein_length}" > protein_length_s sort "${protein_length}" > protein_length_s
join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
sort -k 2 join1.tsv > join1.tsv_s sort -k 2 "join1.tsv" > join1.tsv_s
join -1 2 -2 1 -t'\t' join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv' join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
awk -f "${baseDir}/scripts/filter_blastp.awk" \ awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
-v coverage="${min_coverage}" \
-v identity="${min_identity}" \
"${blastp}" > 'filtered_blastp.tsv' "${blastp}" > 'filtered_blastp.tsv'
""" """
} }

View File

@ -1,21 +0,0 @@
#!/usr/bin/env -S awk -f
# Filter BLASTp format 6 file
# to keep only the records
# with ID in records.list
#
# Usage:
# awk -f filter_blastp_sequence_id.awk \
# records.list records.blastp.tsv
NR == FNR {
sequence_id = $1
to_remove[sequence_id] = 1
next
}
{
sequence_id = $1
if (!(sequence_id in to_remove)) {
print $0
}
}

View File

@ -1,30 +0,0 @@
#!/usr/bin/env -S awk -f
# Usage: $0 file.abc > filtered_file.abc
BEGIN {
OFS = FS = "\t"
}
{
a=$1
b=$2
c=$3
if (a > b) {
tmp=a
a=b
b=tmp
}
if (!(b in graph[a])) {
graph[a][b] = c
} else {
if (graph[a][b] < c) {
graph[a][b] = c
}
}
}
END {
for (a in graph) {
for (b in graph[a]) {
print a, b, graph[a][b]
}
}
}

View File

@ -5,12 +5,12 @@
BEGIN { BEGIN {
family_identifier=0 family_identifier=0
OFS="\t" OFS="\t"
FS=" "
} }
{ {
family_identifier++ family_identifier++
for (i=1; i <= NF; i++) { split($0, gene_list, " ")
print $i, family_identifier for (gene in gene_list) {
print gene, family_identifier
} }
} }

View File

@ -1,25 +1,19 @@
#!/usr/bin/env -S awk -f #!/usr/bin/env -S awk -f
# Associate a isoform id # Associate a isoform id to the length of the sequence
# to the length of the sequence
# and the associated gene id
BEGIN { BEGIN {
sequence_length=0 sequence_length=0
isoform_id="" isoform_id=""
gene_id=""
OFS="\t"
} }
/^>/ { /^>/ {
if (isoform_id != "") { if (isoform_id != "") {
print isoform_id, sequence_length, gene_id print isoform_id, sequence_length
} else {
isoform_id = $1
gsub(">", "", isoform_id)
sequence_length = 0
} }
isoform_id = $1
gene_id = $4
gsub(">", "", isoform_id)
gsub("gene:", "", gene_id)
sequence_length = 0
} }
/^[^>]/ { /^[^>]/ {
@ -28,6 +22,6 @@ BEGIN {
END { END {
if (isoform_id != "") { if (isoform_id != "") {
print isoform_id, sequence_length, gene_id print isoform_id, sequence_length
} }
} }