sequence-algorithms/content/chapters/part1/1.tex

443 lines
14 KiB
TeX
Raw Normal View History

2024-03-05 12:35:55 +01:00
\chapter{Motif}
2024-03-26 11:13:08 +01:00
\section{Searching a substring in a string}
2024-03-05 12:35:55 +01:00
\begin{algorithm}
2024-03-12 14:11:33 +01:00
\caption{Brute-force search of a motif in a sequence}
\begin{algorithmic}[1]
\Function{FindMotif}{$S$: Array($n$), $M$: Array($m$)}
\Returns{a list of positions}
\State $pos \gets \{\}$
\State $i \gets 0$
\While {$i < n - m + 1$}
\State $j \gets 0$
\While {$j < m$ and $S[i+j] = M[j]$}
\State $j++$
\EndWhile
\If {$j = m$}
\State $pos \gets pos \cup \{i\}$
\EndIf
\State $i++$
\EndWhile
\State \Return $pos$
\EndFunction
\end{algorithmic}
\label{alg:naive-motif-matching}
2024-03-05 12:35:55 +01:00
\end{algorithm}
\begin{algorithm}
2024-03-12 14:11:33 +01:00
\caption{Knuth-Morris-Pratt algorithm}
\begin{algorithmic}[1]
\Function{KMP\_Search}{$S$: Array($n$), $M$: Array($m$)}
\Returns{Integer}
\State $table \gets$ \Call{KMP\_Table}{$M$}
\State $c \gets 0$ \Comment{Count the number of matches}
\State $i \gets 0$
\State $j \gets 0$
\While {$i < n$}
\If{$S[i] = M[j]$}
\State $i \gets i + 1$
\State $j \gets j + 1$
\EndIf
\If {$j = m$}
\State $c \gets c + 1$
\State $j \gets table[j-1]$
\ElsIf {$i < n$ and $M[j] \neq S[i]$}
\If {$j \neq 0$}
\State $j \gets table[j-1]$
\Else
\State $i \gets i + 1$
\EndIf
\EndIf
\EndWhile
\State \Return $c$
\EndFunction
2024-03-05 12:35:55 +01:00
2024-03-12 14:11:33 +01:00
\Function{KMP\_Table}{$M$: Array($m$)}
\Returns{Array($m$)}
\State $previous \gets 0$ \Comment{Length of the current longest proper prefix that is also a suffix}
\State $table \gets $ array of zeros of size $m$
\State $i \gets 1$ \Comment{$table[0]$ is always $0$}
\While {$i < m$}
\If {$M[i] = M[previous]$}
\State $previous \gets previous + 1$
\State $table[i] \gets previous$
\State $i \gets i + 1$
\Else
\If {$previous \neq 0$}
\State $previous \gets table[previous - 1]$ \Comment{Fall back to a shorter border, without moving $i$}
\Else
\State $table[i] \gets 0$
\State $i \gets i + 1$
\EndIf
\EndIf
\EndWhile
\State \Return $table$
\EndFunction
\end{algorithmic}
2024-03-05 12:35:55 +01:00
\end{algorithm}
2024-03-26 11:13:08 +01:00
\section{Using matrices to search motifs}
Let $S_{1}$ and $S_{2}$ be two sequences.
$S_{1} = $ ACGUUCC
$S_{2} = $ GUU
\begin{table}
\centering
\begin{tabular}{c|ccccccc}
& A & C & G & U & U & C & C \\
\hline
G & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\
U & 0 & 0 & 0 & 1 & 1 & 0 & 0 \\
U & 0 & 0 & 0 & 1 & 1 & 0 & 0
\end{tabular}
\caption{Comparison matrix}
\end{table}
Let $n = |S_{1}|$, $m = |S_{2}|$
Building the matrix takes $\mathcal{O}(n \cdot m)$, and the diagonals must then still be found, so this approach is slightly less efficient than \autoref{alg:naive-motif-matching}.
To find repetitions, we can use a comparison matrix with a single sequence against itself. A repetition would appear as a diagonal of ones, not on the main diagonal.
Let $S = $ ACGUUACGUU. Let's write the comparison matrix.
\begin{table}
\includegraphics{./figures/part1/comparison_matrix_repetitions.pdf}
\caption{Comparison matrix for $seq = $ ``ACGUUACGUUGUU''}
\end{table}
\begin{algorithm}
\caption{Construct a comparison matrix}
\begin{algorithmic}[1]
\Function{ComparisonMatrix}{$S$: Array($n$)}
\State $M \gets $ Array($n$, $n$)
\For{($i = 0$; $i < n$; $i++$)}
\For{$j = 0$; $j < n$; $j++$}
\If {$S[i] = S[j]$}
\State $M[i][j] = 1$
\Else
\State $M[i][j] = 0$
\EndIf
\EndFor
\EndFor
\State \Return $M$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Construct the top half of a comparison matrix}
\begin{algorithmic}[1]
\Function{ComparisonMatrix}{$S$: Array($n$)}
\State $M \gets$ Array($n$,$n$)
\For{($i = 0$; $i < n$; $i++$)}
\For{$j = i$; $j < n$; $j++$}
\If {$S[i] = S[j]$}
\State $M[i][j] = 1$
\Else
\State $M[i][j] = 0$
\EndIf
\EndFor
\EndFor
\State \Return $M$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Find repetitions (with a set of visited segments)}
\begin{algorithmic}[1]
\Function{FindRepetitions}{$S$: Array($n$)}
\Returns{A list of start and end positions for repeated sequences}
\State $M = $ \Call{ComparisonMatrix}{S}
\State $pos = \{\}$
\State $visited = \{\}$
\For {($i_{start} = 0$; $i_{start} < n$; $i_{start}++$)}
\For {($j_{start} = i_{start}+1$; $j_{start} < n$; $j_{start}++$)}
\If{$M[i_{start}][j_{start}] = 1$ and $(i_{start}, j_{start}) \notin visited$}
\State $i = i_{start}$
\State $j = j_{start}$
\While {$i < n$ and $j < n$ and $M[i][j] = 1$}
\State $i++$
\State $j++$
\State $visited = visited \cup \{(i, j)\}$
\EndWhile
\State $pos = pos \cup \{((i_{start}, i-1), (j_{start},j-1))\}$
\EndIf
\EndFor
\EndFor
\State \Return $pos$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Find repetitions with an exploration of diagonals}
\begin{algorithmic}[1]
\Function{FindRepetitions}{$S$: Array($n$)}
\Returns{A list of start and end positions for repeated sequences}
\State $M$ = \Call{ComparisonMatrix}{S}
\State $pos = \{\}$
\For {($diag = 1$; $diag < n$; $diag++$)}
\State $j = diag$
\State $i = 0$
\While {$i < n$ and $j < n$}
\If {$M[i][j] = 1$}
\State $i_{start} = i$
\State $j_{start} = j$
\While {$i < n$ and $j < n$ and $M[i][j] = 1$}
\State $i++$
\State $j++$
\EndWhile
\State $pos = pos \cup \{((i_{start},i-1),(j_{start},j-1))\}$
\EndIf
\State $i++$
\State $j++$
\EndWhile
\EndFor
\State \Return $pos$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Find repetitions with an exploration of diagonals, without nested while}
\begin{algorithmic}[1]
\Function{FindRepetitions}{$S$: Array($n$)}
\Returns{A list of start positions for repeated sequences and match length}
\State $M$ = \Call{ComparisonMatrix}{S}
\State $pos = \{\}$
\For {($diag = 1$; $diag < n$; $diag++$)}
\State $j = diag$
\State $i = 0$
\State $l = 0$
\While {$i < n$ and $j < n$}
\If {$M[i][j] = 1$}
\State $l++$
\Else
\If {$l > 0$}
\State $pos = pos \cup \{(i-l,j-l,l)\}$
\State $l = 0$
\EndIf
\EndIf
\State $i++$
\State $j++$
\EndWhile
\If {$l > 0$}
\State $pos = pos \cup \{(i-l,j-l,l)\}$
\EndIf
\EndFor
\State \Return $pos$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Find repetitions}
\begin{algorithmic}[1]
\Function{FindRepetitions}{$S$: Array($n$)}
\Returns{A list of start and end positions for repeated sequences}
\State $M$ = \Call{ComparisonMatrix}{S}
\State $pos = \{\}$
\For {$i_{start} = 0$; $i_{start} < n$; $i_{start}++$}
\For {$j_{start} = i_{start}+1$; $j_{start} < n$; $j_{start}++$}
\If{$M[i_{start}][j_{start}] = 1$}
\State $i = i_{start}$
\State $j = j_{start}$
\While {$i < n$ and $j < n$ and $M[i][j] = 1$}
\State $M[i][j] = 0$ \Comment{Ensure that the segment is not explored again}
\State $i++$
\State $j++$
\EndWhile
\State $pos = pos \cup \{((i_{start}, i-1), (j_{start},j-1))\}$
\EndIf
\EndFor
\EndFor
\State \Return $pos$
\EndFunction
\end{algorithmic}
\end{algorithm}
\section{Automata}
An automaton is a tuple $\langle S, s_{0}, T, \Sigma,f\rangle$
\begin{itemize}
\item $S$ the set of states
\item $s_{0}$ the initial state
\item $T$ the set of terminal states
\item $\Sigma$ the alphabet
\item $f$ the transition function $f: (s_{1}, c) \to s_{2}$
\end{itemize}
\paragraph{Example} Given the language $L$ on the alphabet $\Sigma = \{A, C, T\}$, $L = \{A^{*}, CTT, CA^{*}\}$
\begin{definition}[Deterministic automaton]
2024-04-02 15:05:56 +02:00
An automaton is deterministic if, for each pair $(p, a) \in S \times \Sigma$, there exists at most one state $q$ such that $f(p, a) = q$.
2024-03-26 11:13:08 +01:00
\end{definition}
\begin{definition}[Complete automaton]
2024-04-02 15:05:56 +02:00
An automaton is complete if, for each pair $(p, a) \in S \times \Sigma$, there exists at least one state $q$ such that $f(p, a) = q$.
2024-03-26 11:13:08 +01:00
\end{definition}
\begin{algorithm}
\caption{Check whether a word belongs to a language for which we have an automaton}
\begin{algorithmic}[1]
\Function{WordInLanguage}{$W$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma,f \rangle$}
\Returns{A Boolean valued to \True{} if the word is recognized by the language automaton}
\State $s \gets s_{0}$
\State $i \gets 0$
\While {$i < n$}
\State $a \gets W[i]$
\If {$\exists f(s, a)$}
\State $s \gets f(s, a)$
\Else
\State \Return \False
\EndIf
\State $i++$
\EndWhile
\If {$s \in T$}
\State \Return \True
\Else
\State \Return \False
\EndIf
\EndFunction
\end{algorithmic}
\end{algorithm}
\subsection{Suffix Automaton}
Let $S = $ AACTACT.
A suffix automaton recognizes all the suffixes of a given sequence.
The suffix language of $S$ is $\{S, ACTACT, CTACT, TACT, ACT, CT, T\}$.
\begin{figure}
\centering
\includegraphics{./figures/part1/minimal_suffix_automaton_exercise.pdf}
\caption{Suffix automaton for $S = $ AACTACT}
\end{figure}
\begin{figure}
\centering
\includegraphics{./figures/part1/minimal_suffix_automaton_exercise_bis.pdf}
\caption{Suffix automaton for $S = $ TCATCATT}
\end{figure}
\begin{algorithm}
\caption{Check if a motif occurs in a sequence in $\mathcal{O}(m)$, using the suffix automaton built from the sequence}
\begin{algorithmic}[1]
\Function{CheckMotifInSuffixAutomaton}{$W$: Array($m$), $A$: $\langle S, s_{0}, T, \Sigma,f \rangle$}
\Returns{Boolean valued to \True{} if the motif is in the sequence}
\State $s \gets s_{0}$
\State $i \gets 0$
\While {$i < m$ and $\exists f(s, W[i])$}
\State $s \gets f(s, W[i])$
\State $i++$
\EndWhile
\If {$i=m$}
\State \Return \True
\Else
\State \Return \False
\EndIf
\EndFunction
\end{algorithmic}
\end{algorithm}
The complexity of the pattern matching algorithm is $\mathcal{O}(n + m)$: building the suffix automaton of the sequence is $\mathcal{O}(n)$, and searching the motif in it is $\mathcal{O}(m)$.
\subsection{Automata for motif search}
Let $M$ be a motif $M = $ ACAT.
\begin{figure}
\centering
\includegraphics{./figures/part1/motif_search_automaton.pdf}
\caption{Motif search automaton for $M = $ ACAT}
\end{figure}
The alphabet of motif is the same as the alphabet of the sequence.
The search automaton is complete.
If there exists a letter $c$ in the sequence that is not
in the motif alphabet, we can make a virtual transition from
each state to the initial state whenever we encounter an unknown letter.
\begin{algorithm}
\caption{Search a motif in a sequence with an automaton}
\begin{algorithmic}[1]
\Function{SearchMotif}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
\Returns{A set of positions where the motif has been found}
\State $s \gets s_0$
\State $i \gets 0$
\State $pos \gets \{\}$
\While {$i < n$} % $\exists f(s, S[i])$ We assume $S$ and $P$ are formed on the same alphabet, so we could remove the second check, as $A$ is complete
\If {$s \in T$}
\State $pos \gets pos \cup \{ i - m \}$
\EndIf
\State $s \gets f(s, S[i])$
\State $i++$
\EndWhile
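\If {$s \in T$} \Comment{Match ending on the last letter of $S$}
\State $pos \gets pos \cup \{ i - m \}$
\EndIf
\State \Return $pos$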
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Check if a motif automaton recognizes only the prefix of size $m-1$ of a motif $P$ of size $m$}
\begin{algorithmic}[1]
\Function{SearchMotifLastPrefix}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
\Returns{A Boolean valued to \True{} if the prefix of size $m-1$ of the motif has been found in the sequence}
\State $s \gets s_0$
\State $i \gets 0$
\State $T_{new} \gets \{\}$
\For {$q \in S$} \Comment{$q$ is distinct from the current state $s$}
\For {$a \in \Sigma$}
\For {$t \in T$}
\If {$\exists f(q, a)$ and $f(q, a) = t$}
\State $T_{new} \gets T_{new} \cup \{q\}$
\EndIf
\EndFor
\EndFor
\EndFor
\While {$i < n$}
\If {$s \in T_{new}$}
\State \Return \True
\EndIf
\State $s \gets f(s, S[i])$
\State $i++$
\EndWhile
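\If {$s \in T_{new}$} \Comment{State reached after reading the last letter}
\State \Return \True
\EndIf
\State \Return \False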
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
\caption{Check if a motif automaton recognizes only the prefix of size $m-1$ of a motif $P$ of size $m$, knowing the sequence of the motif}
\begin{algorithmic}[1]
\Function{SearchMotifLastPrefix}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
\Returns{A Boolean valued to \True{} if the prefix of size $m-1$ of the motif has been found in the sequence}
\State $s \gets s_0$
\State $i \gets 0$
\While {$i < n$ and $f(s, P[m-1]) \notin T$}
\State $s \gets f(s, S[i])$
\State $i++$
\EndWhile
\If{$f(s, P[m-1]) \in T$}
\State \Return \True
\Else
\State \Return \False
\EndIf
\EndFunction
\end{algorithmic}
2024-04-02 15:05:56 +02:00
\end{algorithm}