Compare commits

...

5 Commits

Author SHA1 Message Date
Samuel Ortion 3e3c184153
Add a bash script to compute Ka/Ks 2024-12-24 11:31:02 +01:00
Samuel Ortion 5322a9686a
Add a workflow to extract Ka/Ks values
Nextflow workflow, but is stuck with memory overflow.
2024-12-24 11:28:30 +01:00
Samuel Ortion f766ebcc4a
Rust program to generate all gene pairs 2024-12-22 19:15:58 +01:00
Samuel Ortion 13f872cf10
Rust tagfinder 2024-11-08 12:42:59 +01:00
Samuel Ortion 53f6a60943
Update workflow 2024-11-04 11:37:20 +01:00
27 changed files with 1916 additions and 23 deletions

1009
notebook.org Normal file

File diff suppressed because it is too large Load Diff

6
rust/pairs/Cargo.toml Normal file
View File

@ -0,0 +1,6 @@
[package]
name = "pairs"
version = "0.1.0"
edition = "2021"
[dependencies]

31
rust/pairs/src/main.rs Normal file
View File

@ -0,0 +1,31 @@
fn print_gene_pairs(genes: &Vec<String>) {
for i in 0..(genes.len()-1) {
for j in (i+1)..(genes.len()) {
println!("{}\t{}", genes[i], genes[j]);
}
}
}
fn main() {
let mut family_index: String = "".to_string();
let mut family_genes: Vec<String> = Vec::new();
for line in std::io::stdin().lines() {
let line = line.unwrap();
let mut parts = line.split("\t");
let gene = parts.next().unwrap().to_string();
let family = parts.next().unwrap().to_string();
println!("{}", family);
if family != family_index {
family_index = family;
if family_genes.len() > 1 {
print_gene_pairs(&family_genes);
}
family_genes.clear();
}
family_genes.push(gene);
}
if family_genes.len() > 1 {
print_gene_pairs(&family_genes);
}
}

14
rust/tagfinder/.gitignore vendored Normal file
View File

@ -0,0 +1,14 @@
# Generated by Cargo
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
**/*.rs.bk
# MSVC Windows builds of rustc generate these, which store debugging information
*.pdb

237
rust/tagfinder/Cargo.lock generated Normal file
View File

@ -0,0 +1,237 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 3
[[package]]
name = "anstream"
version = "0.6.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "23a1e53f0f5d86382dafe1cf314783b2044280f406e7e1506368220ad11b1338"
dependencies = [
"anstyle",
"anstyle-parse",
"anstyle-query",
"anstyle-wincon",
"colorchoice",
"is_terminal_polyfill",
"utf8parse",
]
[[package]]
name = "anstyle"
version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
[[package]]
name = "anstyle-parse"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9"
dependencies = [
"utf8parse",
]
[[package]]
name = "anstyle-query"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c"
dependencies = [
"windows-sys",
]
[[package]]
name = "anstyle-wincon"
version = "3.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2109dbce0e72be3ec00bed26e6a7479ca384ad226efdd66db8fa2e3a38c83125"
dependencies = [
"anstyle",
"windows-sys",
]
[[package]]
name = "clap"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97f376d85a664d5837dbae44bf546e6477a679ff6610010f17276f686d867e8"
dependencies = [
"clap_builder",
"clap_derive",
]
[[package]]
name = "clap_builder"
version = "4.5.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19bc80abd44e4bed93ca373a0704ccbd1b710dc5749406201bb018272808dc54"
dependencies = [
"anstream",
"anstyle",
"clap_lex",
"strsim",
]
[[package]]
name = "clap_derive"
version = "4.5.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab"
dependencies = [
"heck",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "clap_lex"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
[[package]]
name = "colorchoice"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990"
[[package]]
name = "heck"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf"
[[package]]
name = "proc-macro2"
version = "1.0.89"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f139b0662de085916d1fb67d2b4169d1addddda1919e696f3252b740b629986e"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.37"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
dependencies = [
"proc-macro2",
]
[[package]]
name = "strsim"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "syn"
version = "2.0.87"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25aa4ce346d03a6dcd68dd8b4010bcb74e54e62c90c573f394c46eae99aba32d"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tagfinder"
version = "0.1.0"
dependencies = [
"clap",
]
[[package]]
name = "unicode-ident"
version = "1.0.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e91b56cd4cadaeb79bbf1a5645f6b4f8dc5bde8834ad5894a8db35fda9efa1fe"
[[package]]
name = "utf8parse"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"

View File

@ -0,0 +1,7 @@
[package]
name = "tagfinder"
version = "0.1.0"
edition = "2021"
[dependencies]
clap = { version = "4.5.20", features = ["derive"] }

2
rust/tagfinder/Makefile Normal file
View File

@ -0,0 +1,2 @@
run:
cargo run -- --families test/samples/families.tsv --positions test/samples/position.tsv --definitions 0,1

View File

@ -0,0 +1,135 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead};
use std::path::Path;
fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>>
where
P: AsRef<Path>,
{
let file = File::open(filename)?;
Ok(io::BufReader::new(file).lines())
}
pub fn parse_families<P>(filename: P) -> HashMap<String, u64>
where
P: AsRef<Path>,
{
let mut map: HashMap<String, u64> = HashMap::new();
if let Ok(lines) = read_lines(filename) {
for line in lines.flatten() {
let parts: Vec<&str> = line.split("\t").collect();
let gene = parts[0];
let family: u64 = parts[1].parse().unwrap();
map.insert(gene.to_string(), family);
}
}
map
}
pub fn parse_position<P>(filename: P) -> Vec<(String, String, u64, u64)>
where
P: AsRef<Path>,
{
let mut vec: Vec<(String, String, u64, u64)> = Vec::new();
if let Ok(lines) = read_lines(filename) {
for line in lines.flatten() {
let parts: Vec<&str> = line.split("\t").collect();
let gene = parts[0];
let chromosome = parts[1];
let start: u64 = parts[2].parse().unwrap();
let end: u64 = parts[3].parse().unwrap();
vec.push((gene.to_string(), chromosome.to_string(), start, end));
}
}
vec
}
pub fn compare_gene_position(
position_a: &(String, String, u64, u64),
position_b: &(String, String, u64, u64),
) -> Ordering {
let (_gene_a, chromosome_a, start_a, _end_b) = position_a;
let (_gene_b, chromosome_b, start_b, _end_b) = position_b;
if chromosome_a < chromosome_b {
return Ordering::Less;
} else if chromosome_a > chromosome_b {
return Ordering::Greater;
} else {
if start_a < start_b {
return Ordering::Less;
} else {
return Ordering::Greater;
}
}
}
pub fn detect_tag_of_definitions(
definitions: &Vec<u64>,
gene_positions: &Vec<(String, String, u64, u64)>,
gene_families: &HashMap<String, u64>,
) -> HashMap<String, (String, Vec<Option<u64>>)> {
let mut tag_numbers: HashMap<String, (String, Vec<Option<u64>>)> = HashMap::new();
for definition in definitions {
let tag_numbers_def = detect_tag_of_definition(*definition, gene_positions, gene_families);
for (gene, (family, tag)) in tag_numbers_def {
if tag_numbers.contains_key(&gene) {
let (_gene, tags) = tag_numbers.get_mut(&gene).unwrap();
tags.push(tag);
} else {
tag_numbers.insert(gene, (family, vec![tag]));
}
}
}
tag_numbers
}
pub fn detect_tag_of_definition(
definition: u64,
gene_positions: &Vec<(String, String, u64, u64)>,
gene_families: &HashMap<String, u64>,
) -> HashMap<String, (String, Option<u64>)> {
let mut tag_numbers: HashMap<String, (String, Option<u64>)> = HashMap::new();
let mut spacer_index: u64 = 0;
let mut tag_index: u64 = 0;
for (i, (gene_i, chromosome_i, _start_i, _end_i)) in gene_positions.iter().enumerate() {
let family_repr: String;
let mut is_on_tag: bool = false;
if let Some(family_i) = gene_families.get(gene_i) {
family_repr = format!("{}", family_i);
let mut nb_intra_tag_spacers: u64 = 0;
if !tag_numbers.contains_key(gene_i) {
for (gene_j, chromosome_j, _start_j, _end_j) in &gene_positions[(i + 1)..] {
if *chromosome_i != *chromosome_j {
break;
}
if let Some(family_j) = gene_families.get(gene_j) {
if *family_i == *family_j {
nb_intra_tag_spacers = 0;
if !is_on_tag {
is_on_tag = true;
tag_index += 1;
tag_numbers.insert(gene_i.clone(), (family_repr.clone(), Some(tag_index)));
}
tag_numbers.insert(gene_j.clone(), (family_repr.clone(), Some(tag_index)));
}
} else {
nb_intra_tag_spacers += 1;
}
if nb_intra_tag_spacers > definition {
break;
}
}
if !is_on_tag {
tag_numbers.insert(gene_i.clone(), (family_repr, None));
}
}
} else {
spacer_index += 1;
family_repr = format!("spacer{}", spacer_index);
tag_numbers.insert(gene_i.clone(), (family_repr, None));
}
}
tag_numbers
}

View File

@ -0,0 +1,57 @@
use std::{collections::HashMap, path::PathBuf};
use clap::Parser;
use finder::{detect_tag_of_definitions, parse_families, parse_position};
mod finder;
#[derive(Parser)]
#[command(version="0.0.1", about="Find Tandemly Arrayed Genes", long_about="Find Tandemly Arrayed Genes in a genome based on gene position and Tandemly Arrayed Genes definition (i. e. number of spacers)")]
struct Cli {
#[arg(short='f', long="families", value_name="FAMILIES FILE")]
families: PathBuf,
#[arg(short='p', long="positions", value_name="POSITIONS FILE")]
positions: PathBuf,
#[arg(short='d', long="definitions", value_name="TAGs DEFINITIONS", help="Comma seperated integers for the number of spacers to consider")]
definitions: String
}
fn main() {
let cli = Cli::parse();
let gene_families: HashMap<String, u64> = parse_families(cli.families);
let mut gene_positions: Vec<(String, String, u64, u64)> = parse_position(cli.positions);
gene_positions.sort_by(|a, b | crate::finder::compare_gene_position(a, b));
let definitions: Vec<u64> = cli.definitions.split(",").map(|x| x.parse().unwrap()).collect();
let tag_numbers = detect_tag_of_definitions(&definitions, &gene_positions, &gene_families);
// Print header
print!("gene\tfamily\t");
for (i, definition) in definitions.iter().enumerate() {
print!("tag{}", definition);
if (i + 1) < definitions.len() {
print!("\t");
}
}
print!("\n");
for (gene, (family, tags)) in tag_numbers {
print!("{}\t{}\t", gene, family);
for (i, tag) in tags.iter().enumerate() {
if let Some(tag) = tag {
print!("{}", tag);
} else {
print!("-");
}
if (i + 1) < tags.len() {
print!("\t");
}
}
print!("\n");
}
}

View File

@ -0,0 +1,4 @@
A 1
B 1
C 2
D 2
1 A 1
2 B 1
3 C 2
4 D 2

View File

@ -0,0 +1,6 @@
A 1 1 1
B 1 2 1
C 2 3 1
U 2 4 1
V 2 5 1
D 2 10 1
1 A 1 1 1
2 B 1 2 1
3 C 2 3 1
4 U 2 4 1
5 V 2 5 1
6 D 2 10 1

View File

@ -0,0 +1,6 @@
name: clustalw
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::clustalw

View File

@ -0,0 +1,6 @@
name: pal2nal
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::pal2nal

View File

@ -0,0 +1,6 @@
name: paml
channels:
- conda-forge
- bioconda
dependencies:
- bioconda::paml

39
workflow/KaKs/main.nf Normal file
View File

@ -0,0 +1,39 @@
/** Compute Ka, Ks values for each
/** pair of duplicate genes in a TAG
/** and in a gene family
/** */
include { GUNZIP as GUNZIP_CDS } from "./modules/gunzip.nf"
include { KA_KS } from "./modules/ka_ks.nf"
process GENE_PAIRS {
input:
path families
output:
path 'pairs'
script:
"""
cat "${families}" | ${baseDir}/../../rust/pairs/target/release/pairs > 'pairs'
"""
}
workflow {
families = params.families
cds_gz = params.cds
proteome = params.proteome
cds = GUNZIP_CDS(cds_gz)
pairs = GENE_PAIRS(families)
.splitCsv(sep: '\t', header: false)
.map { row -> tuple(row[0], row[1]) }
kaks = KA_KS(pairs, proteome, cds)
// Save to a CSV with collectFile
header = Channel.value("gene_id_1\tgene_id_2\tKa\tKs")
header.concat( kaks )
.collectFile( name: 'test.txt', newLine: true, sort: false )
.view()
}

View File

@ -0,0 +1,12 @@
process GUNZIP {
input:
path zipped_file
output:
path zipped_file.baseName
script:
"""
gunzip -c "${zipped_file}" > "${zipped_file.baseName}"
"""
}

View File

@ -0,0 +1,132 @@
process EXTRACT_TWO_PROTEINS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path fasta_file
output:
path "${gene_id_1}_${gene_id_2}.prot.fst"
script:
output_file="${gene_id_1}_${gene_id_2}.prot.fst"
"""
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
"""
}
process EXTRACT_TWO_CDS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path fasta_file
output:
path "${gene_id_1}_${gene_id_2}.cds.fst"
script:
output_file="${gene_id_1}_${gene_id_2}.cds.fst"
"""
bash "${baseDir}/scripts/extract_fasta_records.sh" "${gene_id_1}" "${gene_id_2}" "${fasta_file}" "${output_file}"
"""
}
process CLUSTALW2 {
label 'clustalw'
input:
path protein_sequence
output:
path "${protein_sequence.simpleName}.prot.ali.aln"
script:
"""
clustalw2 -quiet -align -infile="${protein_sequence}" -outfile="${protein_sequence.simpleName}.prot.ali.aln"
"""
}
process PAL2NAL {
label 'pal2nal'
input:
path protein_alignment
path coding_sequence
output:
path "${protein_alignment.simpleName}.cds.ali.phy"
script:
"""
pal2nal.pl "${protein_alignment}" "${coding_sequence}" -output paml > "${protein_alignment.simpleName}.cds.ali.phy"
"""
}
process YN00 {
label 'paml'
input:
path phylip_file
output:
path "${phylip_file.simpleName}.yn"
script:
"""
echo "seqfile = ${phylip_file} \noutfile = ${phylip_file.simpleName}.yn \nverbose = 0\nicode = 0\nweighting = 0\ncommonf3x4 = 0" > yn00.ctl
yn00
"""
}
process EXTRACT_KA_KS {
input:
tuple(val(gene_id_1), val(gene_id_2))
path yn_file
output:
path 'csv_row'
script:
"""
KaKs=\$(awk '
BEGIN {
OFS=","
on_good_section=0
skip=0
}
\$1 == "(B)" {
skip=8
on_good_section=1
}
on_good_section == 1 {
if (skip == 0) {
Ka=\$8
Ks=\$11
print Ka, Ks
exit
} else {
skip -= 1
}
}
' "${yn_file}")
arr=(\${KaKs//,/ })
Ka=\${arr[0]}
Ks=\${arr[1]}
echo "${gene_id_1}\t${gene_id_2}\t\${Ka}\t\${Ks}" > csv_row
"""
}
workflow KA_KS {
take:
gene_id_pair
proteome_fasta
cds_fasta
main:
protein_sequences = EXTRACT_TWO_PROTEINS(gene_id_pair, proteome_fasta)
cds_sequences = EXTRACT_TWO_CDS(gene_id_pair, cds_fasta)
protein_alignment = CLUSTALW2(protein_sequences)
phylip = PAL2NAL(protein_alignment, cds_sequences)
yn = YN00(phylip)
kaks = EXTRACT_KA_KS(gene_id_pair, yn)
emit:
kaks = kaks
}

View File

@ -0,0 +1,26 @@
#!/usr/bin/env sh
gene_id_1="${1}"
gene_id_2="${2}"
fasta_file="${3}"
output_file="${4}"
echo "" > "${output_file}"
for gene_id in ${gene_id_1} ${gene_id_2}; do
awk -v gene_id="${gene_id}" '
BEGIN {
on_gene=0
}
/^[^>]/ && on_gene == 1 {
print $0
}
/^>/ {
gene = $4
gsub("gene:", "", gene)
if (gene == gene_id) {
on_gene=1
print $0
} else {
on_gene=0
}
}
' "${fasta_file}" >> "${output_file}"
done

76
workflow/KaKs/ugly.sh Normal file
View File

@ -0,0 +1,76 @@
#!/usr/bin/env bash
# Bash version of the Nextflow script that should run faster
output_file="Glycine_max_duplicate_genes_KaKs.tsv"
proteome_fasta="/home/sortion/Documents/course/master/M2/S1/comparative_genomics/project/tmp/proteome_filtered.fa"
cds_fasta="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/data/Glycine_max.Glycine_max_v2.1.cds.all.fa"
set -euo pipefail
mkdir -p tmp
echo "seqfile = ./tmp/cds.ali.phy
outfile = ./tmp/yn
verbose = 0
icode = 0
weighting = 0
commonf3x4 = 0" > yn00.ctl
compute_kaks() {
local gene_id_1
local gene_id_2
gene_id_1=$1
gene_id_2=$2
# Extract the proteins
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${proteome_fasta} "./tmp/prot.fst"
# Extract the CDS
./scripts/extract_fasta_records.sh $gene_id_1 $gene_id_2 ${cds_fasta} "./tmp/cds.fst"
# Run clustalw2
clustalw2 -quiet -align -infile="./tmp/prot.fst" -outfile="./tmp/prot.ali.aln"
# Run Pal2Nal
pal2nal.pl "./tmp/prot.ali.aln" "./tmp/cds.fst" -output paml > "./tmp/cds.ali.phy"
# Run yn00
yn00
# Extract Ka/Ks
awk '
BEGIN {
on_good_section=0
skip=0
}
$1 == "(B)" {
skip=8
on_good_section=1
}
on_good_section == 1 {
if (skip == 0) {
Ka=$8
Ks=$11
print Ka, Ks
exit
} else {
skip -= 1
}
}
' "./tmp/yn"
}
echo "gene_a,gene_b,ka,ks" > "${output_file}"
input="/media/data/sync/Documents/course/master/M2/S1/comparative_genomics/project/tmp/Glycine_max_duplicate_gene_pairs.tsv"
n=$(wc -l "${input}")
i=0
while IFS= read -r line
do
echo $i / $n
gene_a=$(echo $line | awk '{print $1}')
gene_b=$(echo $line | awk '{print $2}')
i=$(expr $i + 1)
kaks=$(compute_kaks $gene_a $gene_b | tail -1 | awk 'NF == 2')
if [[ ! -z $kaks ]]
then
arr=(${kaks//\t/ })
echo "${gene_a},${gene_b},${arr[0]},${arr[1]}" >> "${output_file}"
fi
done < "${input}"

View File

@ -8,18 +8,26 @@
nextflow.enable.dsl = 2;
include { GUNZIP } from "./modules/gunzip.nf"
include { BLAST_MAKEBLASTDB } from "./modules/blast.nf"
include { BLAST_BLASTP } from "./modules/blast.nf"
include { BLAST_ALL_AGAINST_ALL } from "./modules/blast.nf"
include { FILTER_FASTA } from "./modules/filter_fasta.nf"
include { FILTER_BLASTP } from "./modules/filter_blastp.nf"
include { CLUSTERING } from "./modules/clustering.nf"
process PROTEIN_GENE_MAPPING {
input:
path proteome
output:
path 'protein_gene.tsv'
}
workflow {
proteome = Channel.fromPath(params.proteome)
GUNZIP(proteome)
FILTER_FASTA(GUNZIP.out)
BLAST_MAKEBLASTDB(params.species, FILTER_FASTA.out.proteome)
BLAST_BLASTP(params.species, FILTER_FASTA.out.proteome, BLAST_MAKEBLASTDB.out)
FILTER_BLASTP(BLAST_BLASTP.out, FILTER_FASTA.out.protein_length)
BLAST_ALL_AGAINST_ALL(FILTER_FASTA.out.proteome)
FILTER_BLASTP(params.min_coverage, params.min_identity, BLAST_ALL_AGAINST_ALL.out, FILTER_FASTA.out.lengths)
CLUSTERING(FILTER_BLASTP.out)
}

View File

@ -34,6 +34,19 @@ process BLAST_BLASTP {
script:
"""
blastp -query "${proteome}" -db 'db' -outfmt '6' -out "${species}.all-against-all.blastp.tsv"
blastp -query "${proteome}" -db 'db' -outfmt '6' -num_threads 7 -out "${species}.all-against-all.blastp.tsv"
"""
}
workflow BLAST_ALL_AGAINST_ALL {
take:
proteome
main:
BLAST_MAKEBLASTDB(params.species, proteome)
BLAST_BLASTP(params.species, proteome, BLAST_MAKEBLASTDB.out)
emit:
BLAST_BLASTP.out
}

View File

@ -46,7 +46,7 @@ workflow CLUSTERING {
main:
BLASTP_TO_ABC(blastp_tsv)
MCL(BLASTP_TO_ABC).out
MCL(BLASTP_TO_ABC.out)
MCL_TO_TSV(MCL.out)
emit:

View File

@ -1,6 +1,7 @@
/** Filter blastp output based on coverage and identity percentage
/**/
process FILTER_BLASTP {
input:
@ -8,18 +9,21 @@ process FILTER_BLASTP {
val min_identity
path blastp
path protein_length
path protein_gene
output:
path 'filtered_blastp.tsv'
script:
"""
sort "${blastp}" > blastp_s
sort "${protein_length}" > protein_length_s
join -1 1 -2 1 "blastp_s" "protein_length_s" > join1.tsv
sort -k 2 "join1.tsv" > join1.tsv_s
join -1 2 -2 1 "join1.tsv_s" "protein_length_s" > 'joined.blastp.tsv'
awk -f "${baseDir}/scripts/filter_blastp.awk" -v coverage="${min_coverage}" -v identity "${min_identity}" \
sort -k 1 "${blastp}" > blastp_s
sort -k 1 "${protein_length}" > protein_length_s
join -1 1 -2 1 -t'\t' blastp_s protein_length_s > join1.tsv
sort -k 2 join1.tsv > join1.tsv_s
join -1 2 -2 1 -t'\t' join1.tsv_s' 'protein_length_s' > 'joined.blastp.tsv'
awk -f "${baseDir}/scripts/filter_blastp.awk" \
-v coverage="${min_coverage}" \
-v identity="${min_identity}" \
"${blastp}" > 'filtered_blastp.tsv'
"""
}

View File

@ -0,0 +1,21 @@
#!/usr/bin/env -S awk -f
# Filter BLASTp format 6 file
# to keep only the records
# with ID in records.list
#
# Usage:
# awk -f filter_blastp_sequence_id.awk \
# records.list records.blastp.tsv
NR == FNR {
sequence_id = $1
to_remove[sequence_id] = 1
next
}
{
sequence_id = $1
if (!(sequence_id in to_remove)) {
print $0
}
}

View File

@ -0,0 +1,30 @@
#!/usr/bin/env -S awk -f
# Usage: $0 file.abc > filtered_file.abc
BEGIN {
OFS = FS = "\t"
}
{
a=$1
b=$2
c=$3
if (a > b) {
tmp=a
a=b
b=tmp
}
if (!(b in graph[a])) {
graph[a][b] = c
} else {
if (graph[a][b] < c) {
graph[a][b] = c
}
}
}
END {
for (a in graph) {
for (b in graph[a]) {
print a, b, graph[a][b]
}
}
}

View File

@ -5,12 +5,12 @@
BEGIN {
family_identifier=0
OFS="\t"
FS=" "
}
{
family_identifier++
split($0, gene_list, " ")
for (gene in gene_list) {
print gene, family_identifier
for (i=1; i <= NF; i++) {
print $i, family_identifier
}
}

View File

@ -1,19 +1,25 @@
#!/usr/bin/env -S awk -f
# Associate a isoform id to the length of the sequence
# Associate a isoform id
# to the length of the sequence
# and the associated gene id
BEGIN {
sequence_length=0
isoform_id=""
gene_id=""
OFS="\t"
}
/^>/ {
if (isoform_id != "") {
print isoform_id, sequence_length
} else {
isoform_id = $1
gsub(">", "", isoform_id)
sequence_length = 0
print isoform_id, sequence_length, gene_id
}
isoform_id = $1
gene_id = $4
gsub(">", "", isoform_id)
gsub("gene:", "", gene_id)
sequence_length = 0
}
/^[^>]/ {
@ -22,6 +28,6 @@ BEGIN {
END {
if (isoform_id != "") {
print isoform_id, sequence_length
print isoform_id, sequence_length, gene_id
}
}