\chapter{Motif}

\section{Searching a substring in a string}

% Brute-force search: every alignment i of M inside S is tried in turn; the
% inner loop compares character by character and stops at the first mismatch.
\begin{algorithm}
  \caption{Brute-force search of a motif in a sequence}
  \label{alg:naive-motif-matching}
  \begin{algorithmic}[1]
    \Function{FindMotif}{$S$: Array($n$), $M$: Array($m$)}
      \Returns{a list of positions}
      \State $pos \gets \{\}$
      \State $i \gets 0$
      \While {$i < n - m + 1$}
        \State $j \gets 0$
        \While {$j < m$ and $S[i+j] = M[j]$}
          \State $j++$
        \EndWhile
        \If {$j = m$}
          \State $pos \gets pos \cup \{i\}$
        \EndIf
        \State $i++$
      \EndWhile
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% Knuth-Morris-Pratt: i indexes the sequence S, j indexes the motif M.
% table[k] is the length of the longest proper prefix of M[0..k] that is also
% a suffix of M[0..k]; on a mismatch j falls back to table[j-1] and i never
% moves backwards.
\begin{algorithm}
  \caption{Knuth-Morris-Pratt algorithm}
  \begin{algorithmic}[1]
    \Function{KMP\_Search}{$S$: Array($n$), $M$: Array($m$)}
      \Returns{Integer}
      \State $table \gets$ \Call{KMP\_Table}{$M$}
      \State $c \gets 0$ \Comment{Count the number of matches}
      \State $i \gets 0$
      \State $j \gets 0$
      \While {$i < n$}
        % Compare the current sequence character with the current MOTIF
        % character, hence M[j] and not M[i].
        \If {$S[i] = M[j]$}
          \State $i \gets i + 1$
          \State $j \gets j + 1$
        \EndIf
        \If {$j = m$}
          \State $c \gets c + 1$
          \State $j \gets table[j-1]$
        % i may have just reached n, so S[i] must be guarded by i < n.
        \ElsIf {$i < n$ and $M[j] \neq S[i]$}
          \If {$j \neq 0$}
            \State $j \gets table[j-1]$
          \Else
            \State $i \gets i + 1$
          \EndIf
        \EndIf
      \EndWhile
      \State \Return $c$
    \EndFunction
    %
    % Failure table of M. The scan starts at i = 1 because table[0] is always 0,
    % and i only advances once table[i] has been decided.
    \Function{KMP\_Table}{$M$: Array($m$)}
      \Returns{Array($m$)}
      \State $previous \gets 0$
      \State $table \gets$ array of zeros of size $m$
      \State $i \gets 1$
      \While {$i < m$}
        \If {$M[i] = M[previous]$}
          \State $previous \gets previous + 1$
          \State $table[i] \gets previous$
          \State $i \gets i + 1$
        \Else
          \If {$previous \neq 0$}
            \State $previous \gets table[previous - 1]$ \Comment{Retry with a shorter border}
          \Else
            \State $table[i] \gets 0$
            \State $i \gets i + 1$
          \EndIf
        \EndIf
      \EndWhile
      \State \Return $table$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

\section{Using matrices to search motifs}

Let $S_{1}$ and $S_{2}$ be two sequences.
$S_{1} = $ ACGUUCC and $S_{2} = $ GUU.

\begin{table}
  \centering
  \begin{tabular}{c|ccccccc}
      & A & C & G & U & U & C & C \\ \hline
    G & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\
    U & 0 & 0 & 0 & 1 & 1 & 0 & 0 \\
    U & 0 & 0 & 0 & 1 & 1 & 0 & 0
  \end{tabular}
  \caption{Comparison matrix}
\end{table}

Let $n = |S_{1}|$ and $m = |S_{2}|$.
The complexity of this algorithm is $\mathcal{O}(n \cdot m)$ to build the matrix, and it also requires finding the diagonals, so it is a bit less efficient than \autoref{alg:naive-motif-matching}.

To find repetitions, we can use a comparison matrix with a single sequence against itself.
A repetition would appear as a diagonal of ones, not on the main diagonal.
Let $S = $ ACGUUACGUU.
Let's write the comparison matrix.

% NOTE(review): the text above uses ACGUUACGUU while this caption uses
% ACGUUACGUUGUU; confirm which sequence the figure actually shows.
\begin{table}
  \centering
  \includegraphics{./figures/part1/comparison_matrix_repetitions.pdf}
  \caption{Comparison matrix for $seq = $ ``ACGUUACGUUGUU''}
\end{table}

% Full n x n comparison matrix of S against itself: M[i][j] = 1 iff S[i] = S[j].
\begin{algorithm}
  \caption{Construct a comparison matrix}
  \begin{algorithmic}[1]
    \Function{ComparisonMatrix}{$S$: Array($n$)}
      \State $M \gets$ Array($n$, $n$)
      \For {$i = 0$; $i < n$; $i++$}
        \For {$j = 0$; $j < n$; $j++$}
          \If {$S[i] = S[j]$}
            \State $M[i][j] \gets 1$
          \Else
            \State $M[i][j] \gets 0$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $M$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% Same construction restricted to j >= i: only the cells on and above the main
% diagonal are written.
\begin{algorithm}
  \caption{Construct the top half of a comparison matrix}
  \begin{algorithmic}[1]
    \Function{ComparisonMatrix}{$S$: Array($n$)}
      \State $M \gets$ Array($n$, $n$)
      \For {$i = 0$; $i < n$; $i++$}
        \For {$j = i$; $j < n$; $j++$}
          \If {$S[i] = S[j]$}
            \State $M[i][j] \gets 1$
          \Else
            \State $M[i][j] \gets 0$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $M$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% Scan the cells above the main diagonal; from each unvisited 1, follow the
% diagonal while it stays inside the matrix and keeps holding ones. Every cell
% walked is recorded in visited so that a run is reported only once.
\begin{algorithm}
  \caption{Find repetitions (with a set of visited segments)}
  \begin{algorithmic}[1]
    \Function{FindRepetions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M \gets$ \Call{ComparisonMatrix}{$S$}
      \State $pos \gets \{\}$
      \State $visited \gets \{\}$
      \For {$i_{start} = 0$; $i_{start} < n$; $i_{start}++$}
        \For {$j_{start} = i_{start}+1$; $j_{start} < n$; $j_{start}++$}
          \If {$M[i_{start}][j_{start}] = 1$ and $(i_{start}, j_{start}) \notin visited$}
            \State $i \gets i_{start}$
            \State $j \gets j_{start}$
            % The bounds are checked first so that M is never read outside the matrix.
            \While {$i < n$ and $j < n$ and $M[i][j] = 1$}
              \State $visited \gets visited \cup \{(i, j)\}$
              \State $i++$
              \State $j++$
            \EndWhile
            % i and j are one past the end of the run, hence the -1.
            \State $pos \gets pos \cup \{((i_{start}, i-1), (j_{start}, j-1))\}$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% Walk each diagonal above the main one (diag = j - i) from top-left to
% bottom-right; each maximal run of ones is one repetition.
\begin{algorithm}
  \caption{Find repetitions with an exploration of diagonals}
  \begin{algorithmic}[1]
    \Function{FindRepetions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M \gets$ \Call{ComparisonMatrix}{$S$}
      \State $pos \gets \{\}$
      \For {$diag = 1$; $diag < n$; $diag++$}
        \State $j \gets diag$
        \State $i \gets 0$
        \While {$i < n$ and $j < n$}
          \If {$M[i][j] = 1$}
            \State $i_{start} \gets i$
            \State $j_{start} \gets j$
            \While {$i < n$ and $j < n$ and $M[i][j] = 1$}
              \State $i++$
              \State $j++$
            \EndWhile
            \State $pos \gets pos \cup \{((i_{start}, i-1), (j_{start}, j-1))\}$
          \EndIf
          % Here (i, j) is either outside the matrix or a 0 cell, so it can be skipped.
          \State $i++$
          \State $j++$
        \EndWhile
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% Same diagonal walk with a single loop: l counts the length of the current run
% of ones; a run is emitted when a 0 is met or when the diagonal ends.
\begin{algorithm}
  \caption{Find repetitions with an exploration of diagonals, without nested while}
  \begin{algorithmic}[1]
    \Function{FindRepetions}{$S$: Array($n$)}
      \Returns{A list of start positions for repeated sequences and match length}
      \State $M \gets$ \Call{ComparisonMatrix}{$S$}
      \State $pos \gets \{\}$
      \For {$diag = 1$; $diag < n$; $diag++$}
        \State $j \gets diag$
        \State $i \gets 0$
        \State $l \gets 0$
        \While {$i < n$ and $j < n$}
          \If {$M[i][j] = 1$}
            \State $l++$
          \Else
            \If {$l > 0$}
              \State $pos \gets pos \cup \{(i-l, j-l, l)\}$
              \State $l \gets 0$
            \EndIf
          \EndIf
          \State $i++$
          \State $j++$
        \EndWhile
        \If {$l > 0$} \Comment{Run that reaches the end of the diagonal}
          \State $pos \gets pos \cup \{(i-l, j-l, l)\}$
        \EndIf
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% Variant without a visited set: the cells of a run are reset to 0 in M itself
% while the run is followed.
\begin{algorithm}
  \caption{Find repetitions}
  \begin{algorithmic}[1]
    \Function{FindRepetions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M \gets$ \Call{ComparisonMatrix}{$S$}
      \State $pos \gets \{\}$
      \For {$i_{start} = 0$; $i_{start} < n$; $i_{start}++$}
        \For {$j_{start} = i_{start}+1$; $j_{start} < n$; $j_{start}++$}
          \If {$M[i_{start}][j_{start}] = 1$}
            \State $i \gets i_{start}$
            \State $j \gets j_{start}$
            % The bounds are checked first so that M is never read outside the matrix.
            \While {$i < n$ and $j < n$ and $M[i][j] = 1$}
              \State $M[i][j] \gets 0$ \Comment{Ensure that the segment is not explored again}
              \State $i++$
              \State $j++$
            \EndWhile
            \State $pos \gets pos \cup \{((i_{start}, i-1), (j_{start}, j-1))\}$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

\section{Automata}

An automaton is a tuple $\langle S, s_{0}, T, \Sigma, f\rangle$, where:
\begin{itemize}
  \item $S$ is the set of states;
  \item $s_{0}$ is the initial state;
  \item $T$ is the set of terminal states;
  \item $\Sigma$ is the alphabet;
  \item $f$ is the transition function $f\colon (s_{1}, c) \to s_{2}$.
\end{itemize}

\paragraph{Example}
Consider the language $L$ on the alphabet $\Sigma = \{A, C, T\}$, $L = \{A^{*}, CTT, CA^{*}\}$.

\begin{definition}[Deterministic automaton]
  An automaton is deterministic if, for each pair $(p, a) \in S \times \Sigma$, there exists at most one state $q$ such that $f(p, a) = q$.
\end{definition}

\begin{definition}[Complete automaton]
  An automaton is complete if, for each pair $(p, a) \in S \times \Sigma$, there exists at least one state $q$ such that $f(p, a) = q$.
\end{definition}

% Run W through the automaton from s_0; the word is rejected as soon as a
% transition is missing, and accepted only if the last state is terminal.
\begin{algorithm}
  \caption{Check whether a word belongs to a language for which we have an automaton}
  \begin{algorithmic}[1]
    \Function{WordInLanguage}{$W$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma,f \rangle$}
      \Returns{A Boolean valued to \True{} if the word is recognized by the language automaton}
      \State $s \gets s_{0}$
      \State $i \gets 0$
      \While {$i < n$}
        \State $a \gets W[i]$
        \If {$\exists f(s, a)$}
          \State $s \gets f(s, a)$
        \Else
          \State \Return \False
        \EndIf
        \State $i++$
      \EndWhile
      \If {$s \in T$}
        \State \Return \True
      \Else
        \State \Return \False
      \EndIf
    \EndFunction
  \end{algorithmic}
\end{algorithm}

\subsection{Suffix Automaton}

Let $S = $ AACTACT.
A suffix automaton recognizes all the suffixes of a given sequence.
The suffix language of $S$ is $\{S, ACTACT, CTACT, TACT, ACT, CT, T\}$.

\begin{figure}
  \centering
  \includegraphics{./figures/part1/minimal_suffix_automaton_exercise.pdf}
  \caption{Suffix automaton for $S = $ AACTACT}
\end{figure}

\begin{figure}
  \centering
  \includegraphics{./figures/part1/minimal_suffix_automaton_exercise_bis.pdf}
  \caption{Suffix automaton for $S = $ TCATCATT}
\end{figure}

% Follow the motif W through the suffix automaton of the sequence. The motif
% occurs in the sequence iff all m of its characters could be consumed, so the
% final test is against m, the length of W.
\begin{algorithm}
  \caption{Check if a sequence contains a motif, using the suffix automaton built from the sequence ($\mathcal{O}(m)$)}
  \begin{algorithmic}[1]
    \Function{CheckMotifInSuffixAutomaton}{$W$: Array($m$), $A$: $\langle S, s_{0}, T, \Sigma,f \rangle$}
      \Returns{Boolean valued to \True{} if the motif is in the sequence}
      \State $s \gets s_{0}$
      \State $i \gets 0$
      \While {$i < m$ and $\exists f(s, W[i])$}
        \State $s \gets f(s, W[i])$
        \State $i++$
      \EndWhile
      \If {$i = m$}
        \State \Return \True
      \Else
        \State \Return \False
      \EndIf
    \EndFunction
  \end{algorithmic}
\end{algorithm}

The complexity of the pattern matching algorithm is $\mathcal{O}(n + m)$, because building the automaton from the sequence is $\mathcal{O}(n)$ and the search itself is $\mathcal{O}(m)$.

\subsection{Automata for motif search}

Let $M$ be a motif, $M = $ ACAT.
\begin{figure}
  \centering
  \includegraphics{./figures/part1/motif_search_automaton.pdf}
  \caption{Motif search automaton for $M = $ ACAT}
\end{figure}

The alphabet of the motif is the same as the alphabet of the sequence.
The search automaton is complete.
If there exists a letter $c$ in the sequence that is not in the motif alphabet, we can make a virtual transition from each state to the initial state whenever we encounter an unknown letter.

% Feed the sequence to the motif automaton. The terminal test is done AFTER
% each transition, so a match that ends on the last character of the sequence
% is also reported; at that point i is one past the end of the match, which
% therefore starts at i - m.
\begin{algorithm}
  \caption{Search a motif in a sequence with an automaton}
  \begin{algorithmic}[1]
    \Function{SearchMotif}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
      \Returns{A set of positions where the motif has been found}
      \State $s \gets s_0$
      \State $i \gets 0$
      \State $pos \gets \{\}$
      % No ``$\exists f(s, S[i])$'' check is needed in the loop condition: we
      % assume $S$ and $P$ are formed on the same alphabet, so we could remove
      % the second check, as $A$ is complete.
      \While {$i < n$}
        \State $s \gets f(s, S[i])$
        \State $i++$
        \If {$s \in T$}
          \State $pos \gets pos \cup \{ i - m \}$
        \EndIf
      \EndWhile
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% First build T_new, the set of states that have at least one transition into
% a terminal state, then run the sequence through the automaton until such a
% state is reached. The state-enumeration variable is named q so that it does
% not overwrite the current state s (initialised to s_0), and the enumeration
% is phrased in words because the symbol S also names the sequence parameter.
\begin{algorithm}
  \caption{Check if a motif automaton recognizes only the prefix of size $m-1$ of a motif $P$ of size $m$}
  \begin{algorithmic}[1]
    \Function{SearchMotifLastPrefix}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
      \Returns{A Boolean valued to \True{} if the prefix of size $m-1$ of the motif has been found}
      \State $T_{new} \gets \{\}$
      \For {each state $q$ in the state set of $A$}
        \For {$a \in \Sigma$}
          \If {$\exists f(q, a)$ and $f(q, a) \in T$}
            \State $T_{new} \gets T_{new} \cup \{q\}$
          \EndIf
        \EndFor
      \EndFor
      \State $s \gets s_0$
      \State $i \gets 0$
      \While {$i < n$ and $s \notin T_{new}$}
        \State $s \gets f(s, S[i])$
        \State $i++$
      \EndWhile
      % Testing after the loop also covers the state reached on the last character.
      \If {$s \in T_{new}$}
        \State \Return \True
      \Else
        \State \Return \False
      \EndIf
    \EndFunction
  \end{algorithmic}
\end{algorithm}

% Same check without building T_new: since the motif is known, it is enough to
% test whether reading its last character P[m-1] from the current state leads
% to a terminal state.
\begin{algorithm}
  \caption{Check if a motif automaton recognizes only the prefix of size $m-1$ of a motif $P$ of size $m$, knowing the sequence of the motif}
  \begin{algorithmic}[1]
    \Function{SearchMotifLastPrefix}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
      \Returns{A Boolean valued to \True{} if the prefix of size $m-1$ of the motif has been found}
      \State $s \gets s_0$
      \State $i \gets 0$
      \While {$i < n$ and $f(s, P[m-1]) \notin T$}
        \State $s \gets f(s, S[i])$
        \State $i++$
      \EndWhile
      \If {$f(s, P[m-1]) \in T$}
        \State \Return \True
      \Else
        \State \Return \False
      \EndIf
    \EndFunction
  \end{algorithmic}
\end{algorithm}