\chapter{Matrices}
Let $S_{1}$ and $S_{2}$ be two sequences:
$S_{1} = $ ACGUUCC and $S_{2} = $ GUU.
\begin{table}
  \centering
  \begin{tabular}{c|ccccccc}
      & A & C & G & U & U & C & C \\
    \hline
    G & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\
    U & 0 & 0 & 0 & 1 & 1 & 0 & 0 \\
    U & 0 & 0 & 0 & 1 & 1 & 0 & 0
  \end{tabular}
  \caption{Comparison matrix}
\end{table}
Let $n = |S_{1}|$ and $m = |S_{2}|$.
The complexity of this algorithm is $\mathcal{O}(n \cdot m)$ to build the matrix; it also requires finding the diagonals, and thus it is a bit less efficient than \autoref{alg:naive-motif-matching}.
To find repetitions, we can use a comparison matrix of a single sequence against itself.
A repetition then appears as a diagonal of ones that is not on the main diagonal.
Let $S = $ ACGUUACGUU.
Let's write the comparison matrix.
\begin{table}
  % NOTE(review): the text above uses S = ACGUUACGUU, but this caption says ACGUUACGUUGUU; confirm which sequence the figure shows.
  \includegraphics{figures/part1/comparison_matrix_repetitions.pdf}
  \caption{Comparison matrix for $seq = $``ACGUUACGUUGUU''}
\end{table}
\begin{algorithm}
  \caption{Construct a comparison matrix}
  \begin{algorithmic}[1]
    \Function{ComparisonMatrix}{$S$: Array($n$)}
      \State $M \gets $ Array($n$, $n$)
      \For{($i = 0$; $i < n$; $i++$)}
        \For{($j = 0$; $j < n$; $j++$)}
          \If {$S[i] = S[j]$}
            \State $M[i][j] = 1$
          \Else
            \State $M[i][j] = 0$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $M$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Construct the top half of a comparison matrix}
  \begin{algorithmic}[1]
    \Function{ComparisonMatrix}{$S$: Array($n$)}
      \State $M \gets$ Array($n$, $n$)
      \For{($i = 0$; $i < n$; $i++$)}
        \For{($j = i$; $j < n$; $j++$)}
          \If {$S[i] = S[j]$}
            \State $M[i][j] = 1$
          \Else
            \State $M[i][j] = 0$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $M$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions (with a set of visited segments)}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M = $ \Call{ComparisonMatrix}{$S$}
      \State $pos = \{\}$
      \State $visited = \{\}$
\For {($i_{start} = 0$; $i_{start} < n$; $i_{start}++$)}
  \For {($j_{start} = i_{start}+1$; $j_{start} < n$; $j_{start}++$)}
    \If{$M[i_{start}][j_{start}] = 1$ and $(i_{start}, j_{start}) \notin visited$}
      \State $i = i_{start}$
      \State $j = j_{start}$
      \While {$i < n$ and $j < n$ and $M[i][j] = 1$}
        \State $i++$
        \State $j++$
        \State $visited = visited \cup \{(i, j)\}$
      \EndWhile
      \State $pos = pos \cup \{((i_{start}, i-1), (j_{start}, j-1))\}$
    \EndIf
  \EndFor
\EndFor
\State \Return $pos$
\EndFunction
\end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions with an exploration of diagonals}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M$ = \Call{ComparisonMatrix}{$S$}
      \State $pos = \{\}$
      \For {($diag = 1$; $diag < n$; $diag++$)}
        \State $j = diag$
        \State $i = 0$
        \While {$i < n$ and $j < n$}
          \If {$M[i][j] = 1$}
            \State $i_{start} = i$
            \State $j_{start} = j$
            \While {$i < n$ and $j < n$ and $M[i][j] = 1$}
              \State $i++$
              \State $j++$
            \EndWhile
            \State $pos = pos \cup \{((i_{start},i-1),(j_{start},j-1))\}$
          \EndIf
          \State $i++$
          \State $j++$
        \EndWhile
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions with an exploration of diagonals, without nested while}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
      \Returns{A list of start positions for repeated sequences and match length}
      \State $M$ = \Call{ComparisonMatrix}{$S$}
      \State $pos = \{\}$
      \For {($diag = 1$; $diag < n$; $diag++$)}
        \State $j = diag$
        \State $i = 0$
        \State $l = 0$
        \While {$i < n$ and $j < n$}
          \If {$M[i][j] = 1$}
            \State $l++$
          \Else
            \If {$l > 0$}
              \State $pos = pos \cup \{(i-l,j-l,l)\}$
              \State $l = 0$
            \EndIf
          \EndIf
          \State $i++$
          \State $j++$
        \EndWhile
        \If {$l > 0$}
          \State $pos = pos \cup \{(i-l,j-l,l)\}$
        \EndIf
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
\Returns{A list of start and end positions for repeated sequences}
\State $M$ = \Call{ComparisonMatrix}{$S$}
\State $pos = \{\}$
\For {($i_{start} = 0$; $i_{start} < n$; $i_{start}++$)}
  \For {($j_{start} = i_{start}+1$; $j_{start} < n$; $j_{start}++$)}
    \If{$M[i_{start}][j_{start}] = 1$}
      \State $i = i_{start}$
      \State $j = j_{start}$
      \While {$i < n$ and $j < n$ and $M[i][j] = 1$}
        \State $M[i][j] = 0$ \Comment{Ensure that the segment is not explored again}
        \State $i++$
        \State $j++$
      \EndWhile
      \State $pos = pos \cup \{((i_{start}, i-1), (j_{start},j-1))\}$
    \EndIf
  \EndFor
\EndFor
\State \Return $pos$
\EndFunction
\end{algorithmic}
\end{algorithm}
\section{Automata}
An automaton is a tuple $\langle S, s_{0}, T, \Sigma,f\rangle$
\begin{itemize}
  \item $S$ the set of states
  \item $s_{0}$ the initial state
  \item $T$ the set of terminal states
  \item $\Sigma$ the alphabet
  \item $f$ the transition function $f: (s_{1}, c) \to s_{2}$
\end{itemize}
\paragraph{Example}
Consider the language $L$ on the alphabet $\Sigma = \{A, C, T\}$, with $L = \{A^{*}, CTT, CA^{*}\}$.
\begin{definition}[Deterministic automaton]
  An automaton is deterministic if, for each pair $(p, a) \in S \times \Sigma$, there exists at most one state $q$ such that $f(p, a) = q$.
\end{definition}
\begin{definition}[Complete automaton]
  An automaton is complete if, for each pair $(p, a) \in S \times \Sigma$, there exists at least one state $q$ such that $f(p, a) = q$.
\end{definition}
\begin{algorithm}
  \caption{Check whether a word belongs to a language for which we have an automaton}
  \begin{algorithmic}[1]
    \Function{WordInLanguage}{$W$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma,f \rangle$}
      \Returns{A Boolean set to \True{} if the word is recognized by the automaton of the language}
      \State $s \gets s_{0}$
      \State $i \gets 0$
      \While {$i < n$}
        \State $a \gets W[i]$
        \If {$\exists f(s, a)$}
          \State $s \gets f(s, a)$
        \Else
          \State \Return \False
        \EndIf
        \State $i++$
      \EndWhile
      \If {$s \in T$}
        \State \Return \True
      \Else
        \State \Return \False
      \EndIf
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\section{Suffix Automaton}
Let $S = $ AACTACT.
A suffix automaton recognizes all the suffixes of a given sequence.
The suffix language of $S$ is $\{S, ACTACT, CTACT, TACT, ACT, CT, T\}$.
\begin{figure}
  \centering
  \includegraphics{figures/part1/minimal_suffix_automaton_exercise.pdf}
  \caption{Suffix automaton for $S = $ AACTACT}
\end{figure}
\begin{figure}
  \centering
  \includegraphics{figures/part1/minimal_suffix_automaton_exercise_bis.pdf}
  \caption{Suffix automaton for $S = $ TCATCATT}
\end{figure}
\begin{algorithm}
  \caption{Check whether a motif occurs in a sequence using the suffix automaton of the sequence, in $\mathcal{O}(m)$ once the automaton is built}
  \begin{algorithmic}[1]
    \Function{CheckMotifInSuffixAutomaton}{$W$: Array($m$), $A$: $\langle S, s_{0}, T, \Sigma,f \rangle$}
      \Returns{Boolean set to \True{} if the motif is in the sequence}
      \State $s \gets s_{0}$
      \State $i \gets 0$
      \While {$i < m$ and $\exists f(s, W[i])$}
        \State $s \gets f(s, W[i])$
        \State $i++$
      \EndWhile
      \If {$i = m$}
        \State \Return \True
      \Else
        \State \Return \False
      \EndIf
    \EndFunction
  \end{algorithmic}
\end{algorithm}
The complexity of the pattern matching algorithm is $\mathcal{O}(n + m)$, because building the automaton from the sequence is $\mathcal{O}(n)$ and searching for the motif is $\mathcal{O}(m)$.