\chapter{Motif}
\section{Searching for a substring in a string}
\begin{algorithm}
    \caption{Brute-force search of a motif in a sequence}
    \begin{algorithmic}[1]
        % Naive search: every alignment i of M against S is tried in turn, and
        % the alignment is recorded when all m characters agree.
        % The outer loop runs n - m + 1 times and the inner loop at most m times.
        \Function{FindMotif}{$S$: Array($n$), $M$: Array($m$)}
            \Returns{a list of positions}
            \State $\mathit{pos} \gets \{\}$
            \State $i \gets 0$
            % i ranges over the start positions where M still fits inside S.
            \While{$i < n - m + 1$}
                \State $j \gets 0$
                % Advance j while the characters agree; the j < m test comes
                % first so M[j] is never read past the end of the motif.
                \While{$j < m$ and $S[i + j] = M[j]$}
                    \State $j \gets j + 1$
                \EndWhile
                % j reached m only if no mismatch stopped the inner loop.
                \If{$j = m$}
                    \State $\mathit{pos} \gets \mathit{pos} \cup \{i\}$
                \EndIf
                \State $i \gets i + 1$
            \EndWhile
            \State \Return $\mathit{pos}$
        \EndFunction
    \end{algorithmic}
    % The label key is kept byte-for-byte as in the original so that existing
    % cross-references to this algorithm keep resolving.
    \label { alg:naive-motif-matching}
\end{algorithm}
\begin{algorithm}
    \caption{Knuth-Morris-Pratt algorithm}
    \begin{algorithmic}[1]
        % KMP_Search counts the occurrences of M in S.
        % i indexes the sequence S and j indexes the motif M; on a mismatch, j
        % falls back through the table computed by KMP_Table while i stays put.
        \Function{KMP\_Search}{$S$: Array($n$), $M$: Array($m$)}
            \Returns{Integer}
            \State $\mathit{table} \gets$ \Call{KMP\_Table}{$M$}
            \State $c \gets 0$ \Comment{Count the number of matches}
            \State $i \gets 0$
            \State $j \gets 0$
            \While{$i < n$}
                % Compare the current sequence character with the current
                % motif character (index j, not i).
                \If{$S[i] = M[j]$}
                    \State $i \gets i + 1$
                    \State $j \gets j + 1$
                \EndIf
                \If{$j = m$}
                    % The whole motif matched: count it and fall back so that
                    % overlapping occurrences are still detected.
                    \State $c \gets c + 1$
                    \State $j \gets \mathit{table}[j - 1]$
                % The bound guards the read of S[i], so it must be on i.
                \ElsIf{$i < n$ and $M[j] \neq S[i]$}
                    \If{$j \neq 0$}
                        \State $j \gets \mathit{table}[j - 1]$
                    \Else
                        % Nothing matched so far: move on in the sequence.
                        \State $i \gets i + 1$
                    \EndIf
                \EndIf
            \EndWhile
            \State \Return $c$
        \EndFunction
        %
        % KMP_Table builds the fallback table of the motif: table[i] is the
        % length of the longest proper prefix of M[0..i] that is also a suffix
        % of M[0..i]. table[0] is always 0, so the scan starts at i = 1.
        \Function{KMP\_Table}{$M$: Array($m$)}
            \Returns{Array($m$)}
            \State $\mathit{previous} \gets 0$
            \State $\mathit{table} \gets$ array of zeros of size $m$
            \State $i \gets 1$
            % A while loop is used because i must not advance when previous
            % falls back.
            \While{$i < m$}
                \If{$M[i] = M[\mathit{previous}]$}
                    \State $\mathit{previous} \gets \mathit{previous} + 1$
                    \State $\mathit{table}[i] \gets \mathit{previous}$
                    \State $i \gets i + 1$
                \ElsIf{$\mathit{previous} \neq 0$}
                    % Retry position i against a shorter border; the guard
                    % keeps the index previous - 1 non-negative.
                    \State $\mathit{previous} \gets \mathit{table}[\mathit{previous} - 1]$
                \Else
                    \State $\mathit{table}[i] \gets 0$
                    \State $i \gets i + 1$
                \EndIf
            \EndWhile
            \State \Return $\mathit{table}$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\section{Using matrices to search motifs}
Let $S_{1}$ and $S_{2}$ be two sequences:
$S_{1} = $ ACGUUCC and
$S_{2} = $ GUU.
\begin { table}
\centering
\begin { tabular} { c|ccccccc}
& A & C & G & U & U & C & C \\
\hline
G & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\
U & 0 & 0 & 0 & 1 & 1 & 0 & 0 \\
U & 0 & 0 & 0 & 1 & 1 & 0 & 0
\end { tabular}
\caption { Comparison matrix}
\end { table}
Let $n = |S_{1}|$ and $m = |S_{2}|$.
Building the matrix costs $\mathcal{O}(n \cdot m)$, and the diagonals of ones must then still be located, so this approach is a bit less efficient than \autoref { alg:naive-motif-matching}.
To find repetitions, we can use a comparison matrix with a single sequence against itself. A repetition would appear as a diagonal of ones, not on the main diagonal.
Let $ S = $ ACGUUACGUU. Let's write the comparison matrix.
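\begin{table}
    \centering
    \includegraphics{./figures/part1/comparison_matrix_repetitions.pdf}
    % NOTE(review): the sequence in this caption differs from the $S$ given in
    % the sentence just before this float (ACGUUACGUU); confirm which one the
    % figure actually shows.
    \caption{Comparison matrix for $\mathit{seq} = $ ``ACGUUACGUUGUU''}
\end{table}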
\begin{algorithm}
    \caption{Construct a comparison matrix}
    \begin{algorithmic}[1]
        % Compares S with itself: cell (i, j) is 1 exactly when S[i] and S[j]
        % hold the same symbol, 0 otherwise. All n * n cells are written.
        % Assignments use \gets so they cannot be confused with the equality
        % test in the condition.
        \Function{ComparisonMatrix}{$S$: Array($n$)}
            \State $M \gets$ Array($n$, $n$)
            \For{$i \gets 0$ \textbf{to} $n - 1$}
                \For{$j \gets 0$ \textbf{to} $n - 1$}
                    \If{$S[i] = S[j]$}
                        \State $M[i][j] \gets 1$
                    \Else
                        \State $M[i][j] \gets 0$
                    \EndIf
                \EndFor
            \EndFor
            \State \Return $M$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\begin{algorithm}
    \caption{Construct the top half of a comparison matrix}
    \begin{algorithmic}[1]
        % Same comparison as the full matrix, but the inner loop starts at
        % j = i, so only the cells on or above the main diagonal are written.
        % The cells below the diagonal are never assigned by this function.
        \Function{ComparisonMatrix}{$S$: Array($n$)}
            \State $M \gets$ Array($n$, $n$)
            \For{$i \gets 0$ \textbf{to} $n - 1$}
                \For{$j \gets i$ \textbf{to} $n - 1$}
                    \If{$S[i] = S[j]$}
                        \State $M[i][j] \gets 1$
                    \Else
                        \State $M[i][j] \gets 0$
                    \EndIf
                \EndFor
            \EndFor
            \State \Return $M$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\begin{algorithm}
    \caption{Find repetitions (with a set of visited segments)}
    \begin{algorithmic}[1]
        % Scans the cells above the main diagonal of the comparison matrix.
        % Each unvisited cell holding 1 starts a walk down its diagonal; the
        % cells walked over are remembered in `visited' so that the same run
        % of ones is not reported again from one of its inner cells.
        \Function{FindRepetions}{$S$: Array($n$)}
            \Returns{A list of start and end positions for repeated sequences}
            \State $M \gets$ \Call{ComparisonMatrix}{$S$}
            \State $\mathit{pos} \gets \{\}$
            \State $\mathit{visited} \gets \{\}$
            \For{$i_{start} \gets 0$ \textbf{to} $n - 1$}
                \For{$j_{start} \gets i_{start} + 1$ \textbf{to} $n - 1$}
                    \If{$M[i_{start}][j_{start}] = 1$ and $(i_{start}, j_{start}) \notin \mathit{visited}$}
                        \State $i \gets i_{start}$
                        \State $j \gets j_{start}$
                        % The bounds are tested before M[i][j] is read, so the
                        % walk stops at the edge of the matrix.
                        \While{$i < n$ and $j < n$ and $M[i][j] = 1$}
                            \State $\mathit{visited} \gets \mathit{visited} \cup \{(i, j)\}$
                            \State $i \gets i + 1$
                            \State $j \gets j + 1$
                        \EndWhile
                        % i and j are one past the last matching cell, hence
                        % the inclusive ends i - 1 and j - 1. The two segments
                        % are stored together as one pair of (start, end) pairs.
                        \State $\mathit{pos} \gets \mathit{pos} \cup \{((i_{start}, i - 1), (j_{start}, j - 1))\}$
                    \EndIf
                \EndFor
            \EndFor
            \State \Return $\mathit{pos}$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\begin{algorithm}
    \caption{Find repetitions with an exploration of diagonals}
    \begin{algorithmic}[1]
        % Walks each diagonal above the main one (diag = 1 .. n - 1) from its
        % top cell (0, diag). Every maximal run of ones met on the way is
        % reported as a pair of segments with inclusive ends.
        \Function{FindRepetions}{$S$: Array($n$)}
            \Returns{A list of start and end positions for repeated sequences}
            \State $M \gets$ \Call{ComparisonMatrix}{$S$}
            \State $\mathit{pos} \gets \{\}$
            \For{$\mathit{diag} \gets 1$ \textbf{to} $n - 1$}
                \State $j \gets \mathit{diag}$
                \State $i \gets 0$
                \While{$i < n$ and $j < n$}
                    \If{$M[i][j] = 1$}
                        \State $i_{start} \gets i$
                        \State $j_{start} \gets j$
                        % Follow the run of ones to its end, or to the matrix edge.
                        \While{$i < n$ and $j < n$ and $M[i][j] = 1$}
                            \State $i \gets i + 1$
                            \State $j \gets j + 1$
                        \EndWhile
                        \State $\mathit{pos} \gets \mathit{pos} \cup \{((i_{start}, i - 1), (j_{start}, j - 1))\}$
                    \EndIf
                    % After a run, (i, j) is either outside the matrix or on
                    % the cell that ended the run, so stepping over it loses nothing.
                    \State $i \gets i + 1$
                    \State $j \gets j + 1$
                \EndWhile
            \EndFor
            \State \Return $\mathit{pos}$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\begin{algorithm}
    \caption{Find repetitions with an exploration of diagonals, without nested while}
    \begin{algorithmic}[1]
        % Single pass over each diagonal above the main one. l holds the
        % length of the run of ones currently being read; a run is reported as
        % the triple (start in rows, start in columns, length) when it ends.
        \Function{FindRepetions}{$S$: Array($n$)}
            \Returns{A list of start positions for repeated sequences and match length}
            \State $M \gets$ \Call{ComparisonMatrix}{$S$}
            \State $\mathit{pos} \gets \{\}$
            \For{$\mathit{diag} \gets 1$ \textbf{to} $n - 1$}
                \State $j \gets \mathit{diag}$
                \State $i \gets 0$
                \State $l \gets 0$
                \While{$i < n$ and $j < n$}
                    \If{$M[i][j] = 1$}
                        \State $l \gets l + 1$
                    \ElsIf{$l > 0$}
                        % A zero closes the current run, which started l cells earlier.
                        \State $\mathit{pos} \gets \mathit{pos} \cup \{(i - l, j - l, l)\}$
                        \State $l \gets 0$
                    \EndIf
                    \State $i \gets i + 1$
                    \State $j \gets j + 1$
                \EndWhile
                % A run that reaches the edge of the matrix is never closed by
                % a zero, so it is flushed here with the same triple shape.
                \If{$l > 0$}
                    \State $\mathit{pos} \gets \mathit{pos} \cup \{(i - l, j - l, l)\}$
                \EndIf
            \EndFor
            \State \Return $\mathit{pos}$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\begin{algorithm}
    \caption{Find repetitions}
    \begin{algorithmic}[1]
        % Scans the cells above the main diagonal. Each cell holding 1 starts
        % a walk down its diagonal; the cells walked over are reset to 0 in M,
        % which replaces a separate set of visited cells.
        \Function{FindRepetions}{$S$: Array($n$)}
            \Returns{A list of start and end positions for repeated sequences}
            \State $M \gets$ \Call{ComparisonMatrix}{$S$}
            \State $\mathit{pos} \gets \{\}$
            \For{$i_{start} \gets 0$ \textbf{to} $n - 1$}
                \For{$j_{start} \gets i_{start} + 1$ \textbf{to} $n - 1$}
                    \If{$M[i_{start}][j_{start}] = 1$}
                        \State $i \gets i_{start}$
                        \State $j \gets j_{start}$
                        % The bounds are tested before M[i][j] is read, so the
                        % walk stops at the edge of the matrix.
                        \While{$i < n$ and $j < n$ and $M[i][j] = 1$}
                            \State $M[i][j] \gets 0$ \Comment{Ensure that the segment is not explored again}
                            \State $i \gets i + 1$
                            \State $j \gets j + 1$
                        \EndWhile
                        % i and j are one past the last matching cell.
                        \State $\mathit{pos} \gets \mathit{pos} \cup \{((i_{start}, i - 1), (j_{start}, j - 1))\}$
                    \EndIf
                \EndFor
            \EndFor
            \State \Return $\mathit{pos}$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\section { Automata}
An automaton is a tuple $\langle S, s_{0}, T, \Sigma, f \rangle$ where:
\begin{itemize}
    \item $S$ is the set of states;
    \item $s_{0}$ is the initial state;
    \item $T$ is the set of terminal states;
    \item $\Sigma$ is the alphabet;
    \item $f$ is the transition function, $f\colon (s_{1}, c) \mapsto s_{2}$.
\end{itemize}
\paragraph { Example} Given the language $ L $ on the alphabet $ \Sigma = \{ A, C, T \} $ , $ L = \{ A ^ { * } , CTT, CA ^ { * } \} $
\begin{definition}[Deterministic automaton]
    An automaton is deterministic if, for each pair $(p, a) \in S \times \Sigma$, there exists at most one state $q$ such that $f(p, a) = q$.
\end{definition}
\begin{definition}[Complete automaton]
    An automaton is complete if, for each pair $(p, a) \in S \times \Sigma$, there exists at least one state $q$ such that $f(p, a) = q$.
\end{definition}
\begin{algorithm}
    \caption{Check whether a word belongs to a language for which we have an automaton}
    \begin{algorithmic}[1]
        % Runs the automaton A on the word W, one letter at a time, starting
        % from the initial state. The word is rejected as soon as a transition
        % is missing, and accepted only if the state reached after the last
        % letter is terminal.
        \Function{WordInLanguage}{$W$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$}
            \Returns{A Boolean: \True{} if the word is recognized by the language automaton, \False{} otherwise}
            \State $s \gets s_{0}$
            \State $i \gets 0$
            \While{$i < n$}
                \State $a \gets W[i]$
                % f may be undefined on (s, a): the automaton is not assumed
                % to be complete here.
                \If{$\exists f(s, a)$}
                    \State $s \gets f(s, a)$
                \Else
                    \State \Return \False
                \EndIf
                \State $i \gets i + 1$
            \EndWhile
            \If{$s \in T$}
                \State \Return \True
            \Else
                \State \Return \False
            \EndIf
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\subsection { Suffix Automaton}
Let $S = $ AACTACT.
A suffix automaton recognizes all the suffixes of a given sequence.
The suffix language of $S$ is $\{ S, ACTACT, CTACT, TACT, ACT, CT, T \}$.
\begin{figure}
    \centering
    \includegraphics{./figures/part1/minimal_suffix_automaton_exercise.pdf}
    \caption{Suffix automaton for $S = $ AACTACT}
\end{figure}
\begin{figure}
    \centering
    \includegraphics{./figures/part1/minimal_suffix_automaton_exercise_bis.pdf}
    \caption{Suffix automaton for $S = $ TCATCATT}
\end{figure}
\begin{algorithm}
    \caption{Check whether a motif occurs in a sequence, in $\mathcal{O}(m)$, using the suffix automaton built from the sequence}
    \begin{algorithmic}[1]
        % Follows the transitions of the automaton A along the letters of the
        % motif W, starting from the initial state. The loop stops at the
        % first missing transition or once all m letters have been consumed,
        % so it runs at most m times.
        % The set of terminal states T is not consulted: only whether the
        % whole motif could be read matters.
        \Function{CheckMotifInSuffixAutomaton}{$W$: Array($m$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$}
            \Returns{Boolean valued to \True{} if the motif is in the sequence}
            \State $s \gets s_{0}$
            \State $i \gets 0$
            \While{$i < m$ and $\exists f(s, W[i])$}
                \State $s \gets f(s, W[i])$
                \State $i \gets i + 1$
            \EndWhile
            % The motif has length m, so it was read entirely iff i reached m.
            \If{$i = m$}
                \State \Return \True
            \Else
                \State \Return \False
            \EndIf
        \EndFunction
    \end{algorithmic}
\end{algorithm}
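The overall complexity of the pattern matching algorithm is $\mathcal{O}(n + m)$, where $n$ is the length of the sequence and $m$ the length of the motif: building the suffix automaton of the sequence is $\mathcal{O}(n)$, and the search itself is $\mathcal{O}(m)$.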
\subsection { Automata for motif search}
Let $ M $ be a motif $ M = $ ACAT.
\begin{figure}
    \centering
    \includegraphics{./figures/part1/motif_search_automaton.pdf}
    \caption{Motif search automaton for $M = $ ACAT}
\end{figure}
The alphabet of the motif is the same as the alphabet of the sequence.
The search automaton is complete.
If there exists a letter $c$ in the sequence that is not
in the motif alphabet, we can add a virtual transition from
each state to the initial state, taken whenever such an unknown letter is encountered.
\begin{algorithm}
    \caption{Search a motif in a sequence with an automaton}
    \begin{algorithmic}[1]
        % Feeds the sequence to the motif automaton A one letter at a time and
        % records a start position every time a terminal state is reached.
        % NOTE(review): S names both the sequence parameter and the state set
        % inside the tuple A; below, S[i] always refers to the sequence.
        \Function{SearchMotif}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
            \Returns{A set of positions where the motif has been found}
            \State $s \gets s_0$
            \State $i \gets 0$
            \State $\mathit{pos} \gets \{\}$
            % No test on the existence of f(s, S[i]) is needed: S and P are
            % assumed to be formed on the same alphabet and A is complete.
            \While{$i < n$}
                \State $s \gets f(s, S[i])$
                \State $i \gets i + 1$
                % The state is tested after the transition, so an occurrence
                % ending on the last letter of S is reported too. i already
                % points one past the letter just read, hence the start i - m.
                \If{$s \in T$}
                    \State $\mathit{pos} \gets \mathit{pos} \cup \{i - m\}$
                \EndIf
            \EndWhile
            \State \Return $\mathit{pos}$
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\begin{algorithm}
    \caption{Check if a motif automaton recognizes only the prefix of size $m - 1$ of a motif $P$ of size $m$}
    \begin{algorithmic}[1]
        % Step 1 collects in T_new every state that has a transition into a
        % terminal state. Step 2 runs the automaton on the sequence and
        % reports whether one of those states is reached.
        % NOTE(review): S names both the sequence parameter and the state set
        % inside the tuple A; the \For loop ranges over the states, while S[i]
        % refers to the sequence.
        \Function{SearchMotifLastPrefix}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
            \Returns{A Boolean: \True{} if a state with a transition into a terminal state is reached while reading the sequence, \False{} otherwise}
            \State $s \gets s_0$
            \State $i \gets 0$
            \State $T_{\mathit{new}} \gets \{\}$
            % The loop variable is q, not s, so the current state set above is
            % not overwritten before the sequence is read.
            \For{$q \in S$}
                \For{$a \in \Sigma$}
                    \If{$\exists f(q, a)$ and $f(q, a) \in T$}
                        \State $T_{\mathit{new}} \gets T_{\mathit{new}} \cup \{q\}$
                    \EndIf
                \EndFor
            \EndFor
            % Stop reading as soon as a state of T_new is reached.
            \While{$i < n$ and $s \notin T_{\mathit{new}}$}
                \State $s \gets f(s, S[i])$
                \State $i \gets i + 1$
            \EndWhile
            % Testing after the loop also covers the state reached on the last
            % letter of the sequence.
            \If{$s \in T_{\mathit{new}}$}
                \State \Return \True
            \Else
                \State \Return \False
            \EndIf
        \EndFunction
    \end{algorithmic}
\end{algorithm}
\begin{algorithm}
    \caption{Check if a motif automaton recognizes only the prefix of size $m - 1$ of a motif $P$ of size $m$, knowing the sequence of the motif}
    \begin{algorithmic}[1]
        % Variant that uses the motif itself: a state is of interest when
        % reading the last letter of the motif, P[m - 1], from it leads to a
        % terminal state. No set of such states is precomputed.
        % f(s, P[m - 1]) is evaluated without an existence test, so A is
        % presumably complete here -- TODO confirm.
        % NOTE(review): S names both the sequence parameter and the state set
        % inside the tuple A; S[i] refers to the sequence.
        \Function{SearchMotifLastPrefix}{$S$: Array($n$), $A$: $\langle S, s_{0}, T, \Sigma, f \rangle$, $P$: Array($m$)}
            \Returns{A Boolean: \True{} if a state from which the last letter of the motif leads to a terminal state is reached while reading the sequence, \False{} otherwise}
            \State $s \gets s_0$
            \State $i \gets 0$
            % Read the sequence until such a state is reached or the input ends.
            \While{$i < n$ and $f(s, P[m - 1]) \notin T$}
                \State $s \gets f(s, S[i])$
                \State $i \gets i + 1$
            \EndWhile
            % The test is repeated after the loop because the loop can also
            % end on i = n.
            \If{$f(s, P[m - 1]) \in T$}
                \State \Return \True
            \Else
                \State \Return \False
            \EndIf
        \EndFunction
    \end{algorithmic}
\end{algorithm}