Longest common subsequence

Try to use includeonly to limit compilation time
This commit is contained in:
Samuel Ortion 2024-03-19 13:11:18 +01:00
parent fc0331e054
commit 18203b1e49
16 changed files with 739 additions and 135 deletions

41
.latexmkrc Normal file
View File

@ -0,0 +1,41 @@
# Runs ./createFolderStructure.sh through bash and returns the status value
# reported by system().
# NOTE(review): the helper script added alongside this file appears to be
# named folder-structure.sh -- confirm the path used here exists.
sub createFolderStructure {
    my @command = ( 'bash', './createFolderStructure.sh' );
    return system(@command);
}
# Run the folder-structure helper once, when latexmk reads this rc file.
createFolderStructure();

# Lines of the .aux file matching one of these alternatives are ignored when
# latexmk hashes the file to decide whether another run is needed
# (minted cache list, default.pygstyle and <hex>.pygtex entries).
$hash_calc_ignore_pattern{aux} =
    '^\\\\gdef\\\\minted@oldcachelist\{,'
  . '|^\s*default\.pygstyle,'
  . '|^\s*[[:xdigit:]]+\.pygtex';

# Compile with lualatex, writing into build/ and telling minted about it.
$pdflatex =
    'lualatex -shell-escape -file-line-error -interaction=nonstopmode -synctex=1 -output-directory=build %O '
  . '\'\PassOptionsToPackage{outputdir=build}{minted}\input{%S}\'';
$aux_dir = 'build';
$bibtex_use = 2;

# Amend cleaned extensions
$clean_ext .= " fdb_latexmk run.xml synctex.gz";

# Make latexmk quiet.
# latexmk reads $silent (the same variable makeglossaries() checks below);
# the previous name, $latexmk_silent, is not a latexmk setting and had no
# effect.
$silent = 1;

# Makeglossaries: rebuild .acr/.gls from .acn/.glo via the makeglossaries sub.
add_cus_dep( 'acn', 'acr', 0, 'makeglossaries' );
add_cus_dep( 'glo', 'gls', 0, 'makeglossaries' );
$clean_ext .= " acr acn alg glo gls glg";
# Custom-dependency handler: run makeglossaries for the file given in $_[0].
#
# The argument is split into directory and base name with fileparse();
# makeglossaries is invoked with "-d <directory> <base name>", preceded by
# "-q" when $silent is set.
# Returns the status value reported by system().
sub makeglossaries {
    my ( $base_name, $path ) = fileparse( $_[0] );
    my @args = ( "-d", $path, $base_name );

    # Previously the argument list was assembled but never passed on, so the
    # quiet flag was silently dropped.
    unshift @args, "-q" if $silent;

    return system "makeglossaries", @args;
}
# Run biber on the file given in $_[0], passing its directory as
# --output-directory and its base name as the input.
# Returns the status value reported by system().
sub biber {
    my ( $stem, $dir ) = fileparse( $_[0] );
    return system( "biber", "--output-directory", $dir, $stem );
}

View File

@ -1,6 +0,0 @@
# Flags handed to every lualatex invocation.
options=-shell-escape -interaction=nonstopmode -file-line-error
# Default goal: build main.pdf.
all: main.pdf
# Pattern rule: produce X.pdf from X.tex by running lualatex on the source.
%.pdf: %.tex
lualatex $(options) $<

View File

@ -6,13 +6,12 @@
\foreach \i in {0, ..., #2} {%
\edef\FileName{content/chapters/#1/\i}%
\IfFileExists{\FileName}{%
\input{\FileName}%
\include{\FileName}%
}
}
}
\includechapters{part1}{3}
% \includechapters{part2}{2}
\includechapters{part2}{2}
% \includechapters{part3}{1}

View File

@ -28,7 +28,7 @@ Let $S = $ ACGUUACGUU. Let's write the comparison matrix.
\begin{table}
\includegraphics{figures/part1/comparison_matrix_repetitions.pdf}
\includegraphics{./figures/part1/comparison_matrix_repetitions.pdf}
\caption{Comparison matrix for $seq = $``ACGUUACGUUGUU"}
\end{table}
@ -248,17 +248,17 @@ The suffix language of $S$ is $\{S, ACTACT, CTACT, TACT, ACT, CT, T\}$.
\begin{figure}
\centering
\includegraphics{figures/part1/minimal_suffix_automaton_exercise.pdf}
\includegraphics{./figures/part1/minimal_suffix_automaton_exercise.pdf}
\caption{Suffix automaton for $S = $ AACTACT}
\end{figure}
\begin{figure}
\centering
\includegraphics{figures/part1/minimal_suffix_automaton_exercise_bis.pdf}
\includegraphics{./figures/part1/minimal_suffix_automaton_exercise_bis.pdf}
\caption{Suffix automaton for $S = $ TCATCATT}
\end{figure}
\begin{algorithm}
\begin{algorithm}
\caption{Check if a sequence matches a motif, from a suffix automaton, in $\mathcal{O}(m)$ once the automaton is built}
\begin{algorithmic}[1]
\Function{CheckMotifInSuffixAutomaton}{$W$: Array($m$), $A$: $\langle S, s_{0}, T, \Sigma,f \rangle$}
@ -276,6 +276,6 @@ The suffix language of $S$ is $\{S, ACTACT, CTACT, TACT, ACT, CT, T\}$.
\EndIf
\EndFunction
\end{algorithmic}
\end{algorithm}
The complexity of the pattern matching algorithm is $\mathcal{O}(n + m)$, because building the automaton is $\mathcal{O}(m)$
\end{algorithm}
The complexity of the pattern matching algorithm is $\mathcal{O}(n + m)$, because building the automaton is $\mathcal{O}(m)$

View File

@ -4,7 +4,7 @@ Let $M$ be a motif $M = $ ACAT.
\begin{figure}
\centering
\includegraphics{figures/part1/motif_search_automaton.pdf}
\includegraphics{./figures/part1/motif_search_automaton.pdf}
\caption{Motif search automaton for $M = $ ACAT}
\end{figure}

View File

@ -0,0 +1,113 @@
\chapter{Longest common subsequence}
Let $S_{1} = \text{ATCTGAT}$ and $S_{2} = \text{TGCATA}$.
In this case, a longest common subsequence of $S_{1}$ and $S_{2}$ is $\text{TCTA}$.
\begin{algorithm}
\caption{Construct a longest common subsequence matrix}
\begin{algorithmic}[1]
\Function{LCSQ\_Matrix}{$S_{1}$: Array($n$), $S_{2}$: Array($m$)}
\State $M \gets $ Array($n+1$, $m+1$)
\For{($i = 0$; $i < n+1$; $i++$)}
\For{($j = 0$; $j < m+1$; $j++$)}
\If {$i = 0$ or $j = 0$}
\State $M[i][j] = 0$
\Else
\If {$S_{1}[i-1] = S_{2}[j-1]$}
\State $match = M[i-1][j-1] + 1$
\Else
\State $match = M[i-1][j-1]$
\EndIf
\State $gap_{1} = M[i-1][j]$
\State $gap_{2} = M[i][j-1]$
\State $M[i][j] = \max \{ match, gap_{1}, gap_{2}\}$
\EndIf
\EndFor
\EndFor
\State \Return $M$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Construct a longest common subsequence matrix keeping the path in memory}
\begin{algorithmic}[1]
\Function{LCSQ\_Matrix\_Path}{$S_{1}$: Array($n$), $S_{2}$: Array($m$)}
\State $M \gets $ Array($n+1$, $m+1$)
\State $P \gets $ Array($n+1$, $m+1$)
\For {($i = 0$; $i < n+1$; $i++$)}
\State $M[i][0] \gets 0$
\EndFor
\For {($j = 0$; $j < m+1$; $j++$)}
\State $M[0][j] \gets 0$
\EndFor
\For{($i = 1$; $i < n+1$; $i++$)}
\For{($j = 1$; $j < m+1$; $j++$)}
\If {$i = 0$ or $j = 0$}
\State $M[i][j] = 0$
\Else
\If {$S_{1}[i-1] = S_{2}[j-1]$}
\State $M[i][j] \gets M[i-1][j-1] + 1$
\State $P[i][j] \gets '\nwarrow'$
\ElsIf {$M[i][j-1] \geq M[i-1][j]$}
\State $M[i][j] \gets M[i][j-1]$
\State $P[i][j] \gets '\leftarrow'$
\Else
\State $M[i][j] \gets M[i-1][j]$
\State $P[i][j] \gets '\downarrow'$
\EndIf
\EndIf
\EndFor
\EndFor
\State \Return $M, P$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Backtrack the longest common subsequence}
\begin{algorithmic}[1]
\Function{LCSQ}{$S_{1}$: Array($n$), $S_{2}$: Array($m$)}
\State $M, P \gets $ \Call{LCSQ\_Matrix\_Path}{$S_{1}$, $S_{2}$}
\State $L \gets Array(M[n][m])$
\State $k \gets M[n][m] - 1$
\State $i \gets n$
\State $j \gets m$
\While{$i > 0$ and $j > 0$}
\If {$P[i][j] = '\nwarrow' $}
\State $L[k] \gets S_{1}[i-1]$
\State $i--$
\State $j--$
\State $k--$
\ElsIf {$P[i][j] = '\leftarrow'$}
\State $j--$
\Else
\State $i--$
\EndIf
\EndWhile
\State \Return $L$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Recursive reconstruction of the longest common subsequence}
\begin{algorithmic}[1]
\Procedure{LCSQ}{$S_{1}$: Array($n$), $S_{2}$: Array($m$)}
\State $M, P \gets $ \Call{LCSQ\_Matrix\_Path}{$S_{1}$, $S_{2}$}
\State $i \gets n$
\State $j \gets m$
\State \Call{Aux}{$P$, $S_{1}$, $i$, $j$}
\EndProcedure
\Procedure{Aux}{$P$: Array($n+1$, $m+1$), $S_{1}$: Array($n$), $i$, $j$}
\If {$i = 0$ or $j = 0$}
\State \Return
\ElsIf {$P[i][j] = '\nwarrow' $}
\State $l \gets S_{1}[i-1]$
\State \Call{Aux}{$P$, $S_{1}$, $i-1$, $j-1$}
\State \texttt{print}($l$)
\ElsIf {$P[i][j] = '\leftarrow'$}
\State \Call{Aux}{$P$, $S_{1}$, $i$, $j-1$}
\Else
\State \Call{Aux}{$P$, $S_{1}$, $i-1$, $j$}
\EndIf
\EndProcedure
\end{algorithmic}
\end{algorithm}

View File

@ -0,0 +1,24 @@
\part{Sequence alignment}
\section{Similarity between sequences}
A function $d$ is a distance between two sequences $x$ and $y$ in an alphabet $\Sigma$ if
\begin{itemize}
\item $\forall x \in \Sigma^{*}$ $d(x, x) = 0$
\item $\forall x, y \in \Sigma^{*}$ $d(x,y) = d(y,x)$
\item $\forall x, y, z \in \Sigma^{*}$ $d(x, z) \leq d(x, y) + d(y, z)$
\end{itemize}
Here we are interested in a distance that represents the transformation of $x$ into $y$ using three types of basic operations:
\begin{itemize}
\item Substitution
\item Insertion
\item Deletion
\end{itemize}
Example:
\begin{itemize}
\item $sub(a, b) = \begin{cases} 0 & \text{if } a = b \\ 1 & \text{otherwise} \end{cases}$.
\item $del(a) = 1$
\item $ins(a) = 1$
\end{itemize}

176
figures/part2/* Normal file
View File

@ -0,0 +1,176 @@
-- Build the longest-common-subsequence score matrix of two strings.
--
-- seq1, seq2: the strings to compare.
-- Returns a table of tables indexed from 0: rows 0..#seq1, columns 0..#seq2.
-- Row 0 and column 0 stay at 0; matrix[i][j] is the best score between the
-- first i characters of seq1 and the first j characters of seq2.
function lcsq_matrix(seq1, seq2)
  local gap_penalty = 0
  local match_score = 1
  local n1 = string.len(seq1)
  local n2 = string.len(seq2)
  -- Create a (n1+1) x (n2+1) matrix filled with zeros
  local matrix = {}
  for i = 0, n1 do
    matrix[i] = {}
    for j = 0, n2 do
      matrix[i][j] = 0
    end
  end
  -- Fill the rest of the matrix
  for i = 1, n1 do
    for j = 1, n2 do
      -- Diagonal move: scores only when the two characters are equal
      local match = matrix[i-1][j-1]
      if string.sub(seq1, i, i) == string.sub(seq2, j, j) then
        match = match + match_score
      end
      -- Kept local so that they no longer leak into the global environment
      local gap1 = matrix[i-1][j] + gap_penalty
      local gap2 = matrix[i][j-1] + gap_penalty
      matrix[i][j] = math.max(match, gap1, gap2)
    end
  end
  return matrix
end
-- Tell whether `val` occurs among the sequence elements of `tab`
-- (the elements visited by ipairs). Returns true or false.
local function has_value (tab, val)
  local found = false
  for _, candidate in ipairs(tab) do
    if candidate == val then
      found = true
      break
    end
  end
  return found
end
-- Render a matrix (table of tables) as text.
--
-- Every cell is followed by one space and every row by a newline.
-- Rows and columns start at index 0 when that index is present (as in the
-- tables built by lcsq_matrix) and at 1 otherwise; the column count is taken
-- from each row, so non-square matrices are rendered in full.
-- Returns the resulting string ("" for an empty table).
function repr_matrix(matrix)
  local lines = {}
  local first_row = (matrix[0] ~= nil) and 0 or 1
  for i = first_row, #matrix do
    local row = matrix[i]
    local first_col = (row[0] ~= nil) and 0 or 1
    local cells = {}
    for j = first_col, #row do
      cells[#cells + 1] = row[j] .. " "
    end
    lines[#lines + 1] = table.concat(cells) .. "\n"
  end
  return table.concat(lines)
end
-- Produce TikZ code drawing the lcsq_matrix of seq1 and seq2 as a graph.
--
-- The output contains, in order: one node per cell (cells on the traced path
-- are filled gray), one label node per character of seq1 and of seq2, and
-- arrows from every cell to those of its three predecessor cells that hold
-- the smallest predecessor value.
-- Returns the TikZ code as a single string.
function draw_lcsq_matrix_graph(seq1, seq2)
  local matrix = lcsq_matrix(seq1, seq2)
  local tikz_code = ""
  -- Node name of cell (i, j); local so it does not leak as a global
  local function coordinate(i, j)
    return i .. "_" .. j
  end
  local steps = {
    {-1, 0},
    {-1, -1},
    {0, -1}
  }
  local n1 = string.len(seq1)
  local n2 = string.len(seq2)
  -- Trace a path back from the bottom-right cell: at every cell take the
  -- last step (in `steps` order) whose target is not larger than the
  -- smallest value seen so far; steps[1] is used when no target qualifies.
  local path = {}
  local i = n1
  local j = n2
  while i >= 0 and j >= 0 do
    path[#path+1] = coordinate(i, j)
    local min = matrix[i][j]
    local min_step = steps[1]
    for _, step in ipairs(steps) do
      local k = i + step[1]
      local l = j + step[2]
      if k >= 0 and l >= 0 and matrix[k][l] <= min then
        min_step = step
        min = matrix[k][l]
      end
    end
    i = i + min_step[1]
    j = j + min_step[2]
  end
  -- Draw the matrix as tikz node with matrix value
  for i = 0, n1 do
    for j = 0, n2 do
      local options = ""
      if has_value(path, coordinate(i, j)) then
        options = "[fill=gray, draw, minimum size=1]"
      end
      tikz_code = tikz_code .. "\\node" .. options .. " (" .. coordinate(i, j) .. ") at (" .. i .. ", " .. -j .. ")" .. " {" .. matrix[i][j] .. "};"
    end
  end
  -- Add nucleotide labels
  for i = 1, n1 do
    local nt = string.sub(seq1, i, i)
    tikz_code = tikz_code .. "\\node at (" .. i .. "," .. 1 .. ")" .. "{$" .. nt .. "$};"
  end
  for i = 1, n2 do
    local nt = string.sub(seq2, i, i)
    tikz_code = tikz_code .. "\\node at (" .. -1 .. ", " .. -i .. ")" .. "{$ " .. nt .. "$};"
  end
  -- Arrows: link every cell to each predecessor holding the minimum value
  for i = 0, n1 do
    for j = 0, n2 do
      local min = math.huge
      for _, step in ipairs(steps) do
        local k = i + step[1]
        local l = j + step[2]
        if k >= 0 and l >= 0 and matrix[k][l] < min then
          min = matrix[k][l]
        end
      end
      for _, step in ipairs(steps) do
        local k = i + step[1]
        local l = j + step[2]
        if k >= 0 and l >= 0 and matrix[k][l] == min then
          tikz_code = tikz_code .. "\\draw[->] (" .. coordinate(i, j) .. ")" .. " -- " .. "(" .. coordinate(k, l) .. ");"
        end
      end
    end
  end
  return tikz_code
end
-- Print TikZ code for an empty n1 x n2 grid labelled with seq1 and seq2,
-- followed by a gray path traced back from cell (n1-1, n2-1).
--
-- Everything is written to standard output with print(); nothing is
-- returned.
function draw_lcsq_matrix(seq1, seq2)
  -- print(string.format(" Path: %s -> %s", seq1, seq2))
  local matrix = lcsq_matrix(seq1, seq2)
  local n1 = string.len(seq1)
  local n2 = string.len(seq2)
  -- Draw the matrix as tikz nodes
  for i = 0, n1-1 do
    for j = 0, n2-1 do
      print(string.format("\\node[draw, minimum width=1cm, minimum height=1cm] at (%d, -%d) {};", i, j))
    end
  end
  -- Draw the sequence labels
  for i = 1, n1 do
    print(string.format("\\node at (%d, -%d) {%s};", i-1, -1, string.sub(seq1, i, i)))
  end
  for i = 1, n2 do
    print(string.format("\\node at (%d, -%d) {%s};", -1, i-1, string.sub(seq2, i, i)))
  end
  -- Add a path from the bottom right corner to the top left corner, following the minimum of the three possible moves at each step
  local i = n1-1
  local j = n2-1
  print(string.format("\\draw[-,line width=2, gray] (%d, -%d) --", i, j))
  while i > 0 and j > 0 do
    -- All three candidates are read from `matrix`; the earlier code indexed
    -- the standard `table` library by mistake, which raised an error.
    local value = math.min(matrix[i-1][j-1], matrix[i-1][j], matrix[i][j-1])
    if value == matrix[i-1][j-1] then
      i = i - 1
      j = j - 1
    elseif value == matrix[i-1][j] then
      i = i - 1
    else
      j = j - 1
    end
    print(string.format(" (%d, -%d) -- ", i, j))
  end
  print("(0, 0) -- (-1, 1);")
end
-- Script entry point: print the text rendering of the LCS matrix of the two
-- example sequences.
function main()
  local first, second = "ATCTGAT", "TGCATA"
  local scores = lcsq_matrix(first, second)
  print(repr_matrix(scores))
end
main()

181
figures/part2/lcsq.lua Normal file
View File

@ -0,0 +1,181 @@
-- Build the longest-common-subsequence score matrix of two strings.
--
-- seq1, seq2: the strings to compare.
-- Returns a table of tables indexed from 0: rows 0..#seq1, columns 0..#seq2.
-- Row 0 and column 0 stay at 0; matrix[i][j] is the best score between the
-- first i characters of seq1 and the first j characters of seq2.
function lcsq_matrix(seq1, seq2)
  local gap_penalty = 0
  local match_score = 1
  local n1 = string.len(seq1)
  local n2 = string.len(seq2)
  -- Create a (n1+1) x (n2+1) matrix filled with zeros
  local matrix = {}
  for i = 0, n1 do
    matrix[i] = {}
    for j = 0, n2 do
      matrix[i][j] = 0
    end
  end
  -- Fill the rest of the matrix
  for i = 1, n1 do
    for j = 1, n2 do
      -- Diagonal move: scores only when the two characters are equal
      local match = matrix[i-1][j-1]
      if string.sub(seq1, i, i) == string.sub(seq2, j, j) then
        match = match + match_score
      end
      -- Kept local so that they no longer leak into the global environment
      local gap1 = matrix[i-1][j] + gap_penalty
      local gap2 = matrix[i][j-1] + gap_penalty
      matrix[i][j] = math.max(match, gap1, gap2)
    end
  end
  return matrix
end
-- Tell whether `val` occurs among the sequence elements of `tab`
-- (the elements visited by ipairs). Returns true or false.
local function has_value (tab, val)
  local found = false
  for _, candidate in ipairs(tab) do
    if candidate == val then
      found = true
      break
    end
  end
  return found
end
-- Render a matrix (table of tables) as text.
--
-- Every cell is followed by one space and every row by a newline.
-- Rows and columns start at index 0 when that index is present (as in the
-- tables built by lcsq_matrix) and at 1 otherwise.
-- Returns the resulting string ("" for an empty table).
function repr_matrix(matrix)
  -- Accumulate in local tables: the former global `repr` leaked out of the
  -- function and could clash with other code.
  local lines = {}
  local first_row = (matrix[0] ~= nil) and 0 or 1
  for i = first_row, #matrix do
    local row = matrix[i]
    local first_col = (row[0] ~= nil) and 0 or 1
    local cells = {}
    for j = first_col, #row do
      cells[#cells + 1] = row[j] .. " "
    end
    lines[#lines + 1] = table.concat(cells) .. "\n"
  end
  return table.concat(lines)
end
-- Produce TikZ code drawing a score matrix for seq1 and seq2 as a graph.
--
-- seq1, seq2: the two strings; only their lengths and characters are used.
-- matrix: 0-indexed table of tables with rows 0..#seq1 and columns 0..#seq2.
--
-- The output contains, in order: one node per cell (cells on the traced path
-- are filled gray), one label node per character of seq1 and of seq2, and
-- arrows from every cell to those of its three predecessor cells that hold
-- the largest predecessor value.
-- Returns the TikZ code as a single string.
function draw_lcsq_matrix_graph(seq1, seq2, matrix)
  local tikz_code = ""
  -- Node name of cell (i, j); local so it does not leak as a global
  local function coordinate(i, j)
    return i .. "_" .. j
  end
  local steps = {
    {-1, -1},
    {0, -1},
    {-1, 0},
  }
  local n1 = string.len(seq1)
  local n2 = string.len(seq2)
  -- Trace a path back from the bottom-right cell.
  -- NOTE(review): a step replaces the diagonal default (steps[1]) only when
  -- its target is strictly greater than the current cell -- confirm this is
  -- the intended backtracking rule.
  local path = {}
  local i = n1
  local j = n2
  while i >= 0 and j >= 0 do
    path[#path+1] = coordinate(i, j)
    local max = matrix[i][j]
    local max_step = steps[1]
    for _, step in ipairs(steps) do
      local k = i + step[1]
      local l = j + step[2]
      if k >= 0 and l >= 0 and matrix[k][l] > max then
        max_step = step
        max = matrix[k][l]
      end
    end
    i = i + max_step[1]
    j = j + max_step[2]
  end
  -- Draw the matrix as tikz node with matrix value
  for i = 0, n1 do
    for j = 0, n2 do
      local options = ""
      if has_value(path, coordinate(i, j)) then
        options = "[fill=gray, draw, minimum size=1]"
      end
      tikz_code = tikz_code .. "\\node" .. options .. " (" .. coordinate(i, j) .. ") at (" .. i .. ", " .. -j .. ")" .. " {" .. matrix[i][j] .. "};"
    end
  end
  -- Add nucleotide labels
  for i = 1, n1 do
    local nt = string.sub(seq1, i, i)
    tikz_code = tikz_code .. "\\node at (" .. i .. "," .. 1 .. ")" .. "{$" .. nt .. "$};"
  end
  for i = 1, n2 do
    local nt = string.sub(seq2, i, i)
    tikz_code = tikz_code .. "\\node at (" .. -1 .. ", " .. -i .. ")" .. "{$ " .. nt .. "$};"
  end
  -- Arrows: link every cell to each predecessor holding the maximum value
  for i = 0, n1 do
    for j = 0, n2 do
      local max = 0
      for _, step in ipairs(steps) do
        local k = i + step[1]
        local l = j + step[2]
        if k >= 0 and l >= 0 and matrix[k][l] > max then
          max = matrix[k][l]
        end
      end
      for _, step in ipairs(steps) do
        local k = i + step[1]
        local l = j + step[2]
        if k >= 0 and l >= 0 and matrix[k][l] == max then
          tikz_code = tikz_code .. "\\draw[->] (" .. coordinate(i, j) .. ")" .. " -- " .. "(" .. coordinate(k, l) .. ");"
        end
      end
    end
  end
  return tikz_code
end
-- Build TikZ code for an empty n1 x n2 grid labelled with seq1 and seq2,
-- followed by a gray path traced back from cell (n1-1, n2-1).
--
-- Returns the TikZ code as a single string; every fragment is preceded by
-- one space.
function draw_lcsq_matrix(seq1, seq2)
  -- print(string.format(" Path: %s -> %s", seq1, seq2))
  local matrix = lcsq_matrix(seq1, seq2)
  local n1 = string.len(seq1)
  local n2 = string.len(seq2)
  local repr = ""
  -- Draw the matrix as tikz nodes
  for i = 0, n1-1 do
    for j = 0, n2-1 do
      repr = repr .. " " .. string.format("\\node[draw, minimum width=1cm, minimum height=1cm] at (%d, -%d) {};", i, j)
    end
  end
  -- Draw the sequence labels
  for i = 1, n1 do
    repr = repr .. " " .. string.format("\\node at (%d, -%d) {%s};", i-1, -1, string.sub(seq1, i, i))
  end
  for i = 1, n2 do
    repr = repr .. " " .. string.format("\\node at (%d, -%d) {%s};", -1, i-1, string.sub(seq2, i, i))
  end
  -- Add a path from the bottom right corner to the top left corner, following the minimum of the three possible moves at each step
  local i = n1-1
  local j = n2-1
  -- The concatenation operator before string.format was missing here, so the
  -- opening "\draw" fragment was computed and then discarded.
  repr = repr .. " " .. string.format("\\draw[-,line width=2, gray] (%d, -%d) --", i, j)
  while i > 0 and j > 0 do
    local value = math.min(matrix[i-1][j-1], matrix[i-1][j], matrix[i][j-1])
    if value == matrix[i-1][j-1] then
      i = i - 1
      j = j - 1
    elseif value == matrix[i-1][j] then
      i = i - 1
    else
      j = j - 1
    end
    repr = repr .. " " .. string.format(" (%d, -%d) -- ", i, j)
  end
  repr = repr .. " " .. "(0, 0) -- (-1, 1);"
  return repr
end
-- Manual entry point: print the TikZ graph of the LCS matrix of the two
-- example sequences.
function main()
  local first, second = "ATCTGAT", "TGCATA"
  local scores = lcsq_matrix(first, second)
  print(draw_lcsq_matrix_graph(first, second, scores))
end
-- main()
-- Table returned to code that loads this file with require(); it exposes the
-- matrix builder and the two TikZ generators under their own names.
return {
lcsq_matrix=lcsq_matrix,
draw_lcsq_matrix_graph=draw_lcsq_matrix_graph,
draw_lcsq_matrix=draw_lcsq_matrix
}

BIN
figures/part2/lcsq.pdf (Stored with Git LFS) Normal file

Binary file not shown.

18
figures/part2/lcsq.tex Normal file
View File

@ -0,0 +1,18 @@
\documentclass[tikz]{standalone}
\usepackage{tikz}
\usepackage{luatextra}
\begin{document}
\begin{tikzpicture}
\begin{luacode}
lcsq = require('lcsq')
seq2 = "ATCTGAT"
seq1 = "TGCATA"
matrix = lcsq.lcsq_matrix(seq1, seq2)
tikz_code = lcsq.draw_lcsq_matrix_graph(seq1, seq2, matrix)
tex.print(tikz_code)
\end{luacode}
\end{tikzpicture}
\end{document}

8
folder-structure.sh Normal file
View File

@ -0,0 +1,8 @@
#!/bin/sh
# Mirror the directory tree found under ./content inside ./build, so that
# build/content/... exists for every directory of ./content.
#
# The directory list is piped straight into mkdir instead of going through a
# temporary folder_list.txt in the working directory, and the script stops
# if build/ cannot be created or entered.
set -e
mkdir -p build
find ./content -type d | (cd build && xargs mkdir -p)

BIN
main.pdf (Stored with Git LFS)

Binary file not shown.

View File

@ -12,7 +12,8 @@
fontsize=10pt,
fleqn,
oneside
]{scrbook}
]{scrbook}
\usepackage{mus}
@ -64,6 +65,7 @@
\definecolor{clementine}{HTML}{dfa000}
\colorlet{primary}{clementine}
% \includeonly{content/chapters/part1/1}
\makeindex%
\makeglossary%
\begin{document}
@ -77,10 +79,7 @@
\newpage
% \input{content ./introduction}
\input{content/chapters/include}
% \input{content/conclusion}
\end{document}

BIN
tmp.pdf (Stored with Git LFS)

Binary file not shown.

48
tmp.tex
View File

@ -11,7 +11,55 @@
\input{definitions.tex}
\begin{document}
\begin{algorithm}
\caption{Backtrack the longest common subsequence}
\begin{algorithmic}[1]
\Function{LCSQ}{$S_{1}$: Array($n$), $S_{2}$: Array($m$)}
\State $M, P \gets $ \Call{LCSQ\_Matrix\_Path}{$S_{1}$, $S_{2}$}
\State $L \gets Array(M[n][m])$
\State $k \gets M[n][m] - 1$
\State $i \gets n$
\State $j \gets m$
\While{$i > 0$ and $j > 0$}
\If {$P[i][j] = '\nwarrow' $}
\State $L[k] \gets S_{1}[i-1]$
\State $i--$
\State $j--$
\State $k--$
\ElsIf {$P[i][j] = '\leftarrow'$}
\State $j--$
\Else
\State $i--$
\EndIf
\EndWhile
\State \Return $L$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Recursive reconstruction of the longest common subsequence}
\begin{algorithmic}[1]
\Procedure{LCSQ}{$S_{1}$: Array($n$), $S_{2}$: Array($m$)}
\State $M, P \gets $ \Call{LCSQ\_Matrix\_Path}{$S_{1}$, $S_{2}$}
\State $i \gets n$
\State $j \gets m$
\State \Call{Aux}{$P$, $S_{1}$, $i$, $j$}
\EndProcedure
\Procedure{Aux}{$P$: Array($n+1$, $m+1$), $S_{1}$: Array($n$), $i$, $j$}
\If {$i = 0$ or $j = 0$}
\State \Return
\ElsIf {$P[i][j] = '\nwarrow' $}
\State $l \gets S_{1}[i-1]$
\State \Call{Aux}{$P$, $S_{1}$, $i-1$, $j-1$}
\State \texttt{print}($l$)
\ElsIf {$P[i][j] = '\leftarrow'$}
\State \Call{Aux}{$P$, $S_{1}$, $i$, $j-1$}
\Else
\State \Call{Aux}{$P$, $S_{1}$, $i-1$, $j$}
\EndIf
\EndProcedure
\end{algorithmic}
\end{algorithm}
\end{document}