\chapter{Matrices}
Let $S_{1}$ and $S_{2}$ be two sequences, with
$S_{1} =$ ACGUUCC and
$S_{2} =$ GUU.
\begin { table}
\centering
\begin { tabular} { c|ccccccc}
& A & C & G & U & U & C & C \\
\hline
G & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\
U & 0 & 0 & 0 & 1 & 1 & 0 & 0 \\
U & 0 & 0 & 0 & 1 & 1 & 0 & 0
\end { tabular}
\caption { Comparison matrix}
\end { table}
Let $n = |S_{1}|$ and $m = |S_{2}|$.
Building the matrix has complexity $\mathcal{O}(n \cdot m)$; the diagonals must then also be found, so this approach is slightly less efficient than \autoref{alg:naive-motif-matching}.
To find repetitions, we can use a comparison matrix with a single sequence against itself. A repetition would appear as a diagonal of ones, not on the main diagonal.
Let $S =$ ACGUUACGUUGUU. Let's write the comparison matrix.
\begin{figure}
  \centering
  \includegraphics{figures/part1/comparison_matrix_repetitions.pdf}
  \caption{Comparison matrix for $S =$ ``ACGUUACGUUGUU''}
\end{figure}
\begin { algorithm}
\caption { Construct a comparison matrix}
\begin { algorithmic} [1]
\Function { ComparisonMatrix} { $ S $ : Array($ n $ )}
\State $ M \gets $ Array($ n $ , $ n $ )
\For { ($ i = 0 $ ; $ i < n $ ; $ i + + $ )}
\For{($j = 0$; $j < n$; $j++$)}
\If { $ S [ i ] = S [ j ] $ }
\State $ M [ i ] [ j ] = 1 $
\Else
\State $ M [ i ] [ j ] = 0 $
\EndIf
\EndFor
\EndFor
\State \Return $ M $
\EndFunction
\end { algorithmic}
\end { algorithm}
\begin{algorithm}
  \caption{Construct the top half of a comparison matrix}
  \begin{algorithmic}[1]
    \Function{ComparisonMatrix}{$S$: Array($n$)}
      \State $M \gets$ Array($n$, $n$)
      \For{($i = 0$; $i < n$; $i++$)}
        \For{($j = i$; $j < n$; $j++$)}
          \If{$S[i] = S[j]$}
            \State $M[i][j] = 1$
          \Else
            \State $M[i][j] = 0$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $M$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions (with a set of visited segments)}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M =$ \Call{ComparisonMatrix}{$S$}
      \State $pos = \{\}$
      \State $visited = \{\}$
      \For{($i_{start} = 0$; $i_{start} < n$; $i_{start}++$)}
        \For{($j_{start} = i_{start} + 1$; $j_{start} < n$; $j_{start}++$)}
          \If{$M[i_{start}][j_{start}] = 1$ and $(i_{start}, j_{start}) \notin visited$}
            \State $i = i_{start}$
            \State $j = j_{start}$
            \While{$i < n$ and $j < n$ and $M[i][j] = 1$}
              \State $visited = visited \cup \{(i, j)\}$
              \State $i++$
              \State $j++$
            \EndWhile
            \State $pos = pos \cup \{((i_{start}, i - 1), (j_{start}, j - 1))\}$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions with an exploration of diagonals}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M =$ \Call{ComparisonMatrix}{$S$}
      \State $pos = \{\}$
      \For{($diag = 1$; $diag < n$; $diag++$)}
        \State $j = diag$
        \State $i = 0$
        \While{$i < n$ and $j < n$}
          \If{$M[i][j] = 1$}
            \State $i_{start} = i$
            \State $j_{start} = j$
            \While{$i < n$ and $j < n$ and $M[i][j] = 1$}
              \State $i++$
              \State $j++$
            \EndWhile
            \State $pos = pos \cup \{((i_{start}, i - 1), (j_{start}, j - 1))\}$
          \EndIf
          \State $i++$
          \State $j++$
        \EndWhile
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions with an exploration of diagonals, without nested while}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
      \Returns{A list of start positions for repeated sequences and match length}
      \State $M =$ \Call{ComparisonMatrix}{$S$}
      \State $pos = \{\}$
      \For{($diag = 1$; $diag < n$; $diag++$)}
        \State $j = diag$
        \State $i = 0$
        \State $l = 0$
        \While{$i < n$ and $j < n$}
          \If{$M[i][j] = 1$}
            \State $l++$
          \Else
            \If{$l > 0$}
              \State $pos = pos \cup \{(i - l, j - l, l)\}$
              \State $l = 0$
            \EndIf
          \EndIf
          \State $i++$
          \State $j++$
        \EndWhile
        \If{$l > 0$}
          \State $pos = pos \cup \{(i - l, j - l, l)\}$
        \EndIf
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\begin{algorithm}
  \caption{Find repetitions}
  \begin{algorithmic}[1]
    \Function{FindRepetitions}{$S$: Array($n$)}
      \Returns{A list of start and end positions for repeated sequences}
      \State $M =$ \Call{ComparisonMatrix}{$S$}
      \State $pos = \{\}$
      \For{($i_{start} = 0$; $i_{start} < n$; $i_{start}++$)}
        \For{($j_{start} = i_{start} + 1$; $j_{start} < n$; $j_{start}++$)}
          \If{$M[i_{start}][j_{start}] = 1$}
            \State $i = i_{start}$
            \State $j = j_{start}$
            \While{$i < n$ and $j < n$ and $M[i][j] = 1$}
              \State $M[i][j] = 0$ \Comment{Ensure that the segment is not explored again}
              \State $i++$
              \State $j++$
            \EndWhile
            \State $pos = pos \cup \{((i_{start}, i - 1), (j_{start}, j - 1))\}$
          \EndIf
        \EndFor
      \EndFor
      \State \Return $pos$
    \EndFunction
  \end{algorithmic}
\end{algorithm}
\section { Automata}
An automaton is a tuple $ \langle S, s _ { 0 } , T, \Sigma ,f \rangle $
\begin { itemize}
\item $ S $ the set of states
\item $ s _ { 0 } $ the initial state
\item $ T $ the set of terminal states
\item $ \Sigma $ the alphabet
\item $ f $ the transition function $ f: ( s _ { 1 } , c ) \to s _ { 2 } $
\end { itemize}
\paragraph { Example} Given the language $ L $ on the alphabet $ \Sigma = \{ A, C, T \} $ , $ L = \{ A ^ { * } , CTT, CA ^ { * } \} $
\begin { definition} [Deterministic automaton]
An automaton is deterministic if, for each pair $(p, a) \in S \times \Sigma$, there exists at most one state $q$ such that $f(p, a) = q$.
\end { definition}
\begin { definition} [Complete automaton]
An automaton is complete if, for each pair $(p, a) \in S \times \Sigma$, there exists at least one state $q$ such that $f(p, a) = q$.
\end { definition}
\begin { algorithm}
\caption{Check whether a word belongs to a language for which we have an automaton}
\begin { algorithmic} [1]
\Function { WordInLanguage} { $ W $ : Array($ n $ ), $ A $ : $ \langle S, s _ { 0 } , T, \Sigma ,f \rangle $ }
\Returns { A Boolean valued to \True { } if the word is recognized by the language automaton}
\State $ s \gets s _ { 0 } $
\State $ i \gets 0 $
\While { $ i < n $ }
\State $ a \gets W [ i ] $
\If { $ \exists f ( s, a ) $ }
\State $ s \gets f ( s, a ) $
\Else
\State \Return \False
\EndIf
\State $i++$
\EndWhile
\If { $ s \in T $ }
\State \Return \True
\Else
\State \Return \False
\EndIf
\EndFunction
\end { algorithmic}
\end { algorithm}
\section { Suffix Automaton}
Let $S =$ AACTACT.
A suffix automaton recognizes all the suffixes of a given sequence.
The suffix language of $ S $ is $ \{ S, ACTACT, CTACT, TACT, ACT, CT, T \} $ .
\begin{figure}
  \centering
  \includegraphics{figures/part1/minimal_suffix_automaton_exercise.pdf}
  \caption{Suffix automaton for $S =$ AACTACT}
\end{figure}
\begin{figure}
  \centering
  \includegraphics{figures/part1/minimal_suffix_automaton_exercise_bis.pdf}
  \caption{Suffix automaton for $S =$ TCATCATT}
\end{figure}
\begin{algorithm}
  \caption{Check if a sequence contains a motif, using a suffix automaton built from the sequence, in $\mathcal{O}(m)$}
  \begin{algorithmic}[1]
    \Function{CheckMotifInSuffixAutomaton}{$W$: Array($m$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$}
      \Returns{Boolean valued to \True{} if the motif is in the sequence}
      \State $s \gets s_{0}$
      \State $i \gets 0$
      \While{$i < m$ and $\exists f(s, W[i])$}
        \State $s \gets f(s, W[i])$
        \State $i++$
      \EndWhile
      \If{$i = m$}
        \State \Return \True
      \Else
        \State \Return \False
      \EndIf
    \EndFunction
  \end{algorithmic}
\end{algorithm}
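The complexity of the pattern matching algorithm is $\mathcal{O}(n + m)$, where $n$ is the length of the sequence and $m$ the length of the motif, because building the automaton is $\mathcal{O}(n)$ and searching for the motif is $\mathcal{O}(m)$.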