Source code for string2string.search.classical

"""
    This module contains the following algorithms:
        (-) Naive search algorithm ++
        (a) Rabin-Karp algorithm ++ 
        (b) Boyer-Moore algorithm ++ 
        (c) Knuth-Morris-Pratt algorithm
        (d) Suffix Tree algorithm
        (e) Suffix Array algorithm
        (f) Suffix Automaton algorithm
        (g) Aho-Corasick algorithm (basis of fgrep/grep in Unix) ++ (not implemented)
        (h) Ukkonen's algorithm -- (not implemented)
        (i) Wu-Manber algorithm ++ (not implemented)
        (j) Z-Algorithm ++ (not implemented)
"""

from typing import List, Union, Tuple, Optional
from string2string.misc.hash_functions import HashFunction, PolynomialRollingHash


# Parent class for all search algorithms
class SearchAlgorithm:
    """
    This class contains the parent class for all search algorithms.
    """

    def __init__(self) -> None:
        """
        This function initializes the abstract class for all search algorithms.

        Returns:
            None
        """
        pass

    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        Searches for the pattern in a text.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text.
        """
        pass


[docs]class NaiveSearch(SearchAlgorithm): """ This class contains the naive search algorithm. """
[docs] def __init__(self) -> None: """ Initializes the class. Returns: None """ super().__init__()
[docs] def search(self, pattern: str, text: str, ) -> int: """ Searches for the pattern in the text. Arguments: text (str): The text to search in. Returns: int: The index of the pattern in the text (or -1 if the pattern is not found). Raises: AssertionError: If the inputs are invalid. """ # Check the inputs assert isinstance(pattern, str), 'The pattern must be a string.' assert isinstance(text, str), 'The text must be a string.' # Set the attributes self.pattern = pattern self.pattern_length = len(self.pattern) # Loop over the text for i in range(len(text) - self.pattern_length + 1): # Check if the strings match if text[i:i + self.pattern_length] == self.pattern: return i # Return -1 if the pattern is not found return -1
# Rabin-Karp search algorithm class
[docs]class RabinKarpSearch(SearchAlgorithm): """ This class contains the Rabin-Karp search algorithm. """
[docs] def __init__(self, hash_function: HashFunction = PolynomialRollingHash(), ) -> None: """ This function initializes the Rabin-Karp search algorithm class, which uses a hash function to search for a pattern in a text. [RK1987]_ Arguments: hash_function (HashFunction): The hash function to use. Returns: None Raises: AssertionError: If the inputs are invalid. .. [RK1987] Karp, R.M. and Rabin, M.O., 1987. Efficient randomized pattern-matching algorithms. IBM Journal of Research and Development, 31(2), pp.249-260. """ assert isinstance(hash_function, HashFunction), 'The hash function must be a HashFunction object.' # Set the attributes # self.pattern = pattern self.hash_function = hash_function
# # Compute the hash value of the pattern # self.pattern_hash = self.hash_function.compute(self.pattern) # # Length of the pattern # self.pattern_length = len(self.pattern)
[docs] def itialize_pattern_hash(self, pattern: str, ) -> None: """ This function initializes the pattern hash value. Arguments: pattern (str): The pattern to search for. Returns: None Raises: AssertionError: If the inputs are invalid. """ # Check the inputs assert isinstance(pattern, str), 'The pattern must be a string.' # Reset the hash function self.hash_function.reset() # Set the attributes self.pattern = pattern # Compute the hash value of the pattern self.pattern_hash = self.hash_function.compute(self.pattern) # Length of the pattern self.pattern_length = len(self.pattern)
[docs] def search(self, pattern: str, text: str, ) -> int: """ This function searches for the pattern in the text. Arguments: pattern (str): The pattern to search for. text (str): The text to search in. Returns: int: The index of the pattern in the text (or -1 if the pattern is not found). Raises: AssertionError: If the inputs are invalid. """ # Check the inputs assert isinstance(text, str), 'The text must be a string.' # Initialize the pattern hash self.itialize_pattern_hash(pattern) # Reset the hash function (in case it was used before) [Important!] self.hash_function.reset() # Compute the hash value of the first window window_hash = self.hash_function.compute(text[:self.pattern_length]) # Loop over the text for i in range(len(text) - self.pattern_length + 1): # print('Window hash: {}'.format(window_hash)) # Check if the hash values match if window_hash == self.pattern_hash: # print('Hash values match at index {}.'.format(i)) j = 0 # Check if the strings match while text[i + j] == self.pattern[j]: j += 1 if j == self.pattern_length: return i # Update the hash value of the window if i < len(text) - self.pattern_length: window_hash = self.hash_function.update(text[i], text[i + self.pattern_length], self.pattern_length) # Return -1 if the pattern is not found return -1
# Knuth-Morris-Pratt (KMP) search algorithm class
[docs]class KMPSearch(SearchAlgorithm): """ This class contains the KMP search algorithm. """
[docs] def __init__(self) -> None: r""" This function initializes the Knuth-Morris-Pratt (KMP) search algorithm class. [KMP1977]_ Arguments: None Returns: None .. note:: * The current version of the KMP algorithm utilizes an auxiliary list called the lps_array, which stands for "longest proper prefix which is also a suffix". The lps_array is a list of integers where lps_array[i] represents the length of the longest proper prefix of the pattern that is also a suffix of the pattern[:i+1]. * By precomputing the lps_array, the KMP algorithm avoids unnecessary character comparisons while searching for the pattern in the text. The algorithm scans the text from left to right and compares characters in the pattern with characters in the text. When a mismatch occurs, the algorithm uses the values in the lps_array to determine the next character in the pattern to compare with the text. * An alternative implementation of the KMP algorithm exists, which uses a finite state automaton (FSA) instead of the lps_array, but this is not implemented in this version of the package. .. [KMP1977] Knuth, D.E., Morris, J.H. and Pratt, V.R., 1977. Fast pattern matching in strings. SIAM journal on computing, 6(2), pp.323-350. """ super().__init__()
# Initialize_lps function
[docs] def initialize_lps(self) -> None: r""" This function initializes the pongest proper prefix suffix (lps) array, which contains the length of the longest proper prefix that is also a suffix of the pattern. IOW: For each index i in the lps array, lps[i] is the length of the longest proper prefix that is also a suffix of the pattern[:i + 1]. In other words, if k = lps[i], then pattern[:k] is equal to pattern[i - k + 1:i + 1] (with the condition that pattern[:k+1] is not equal to pattern[i - k:i + 1]). The lps array is used in the Knuth-Morris-Pratt (KMP) algorithm to avoid unnecessary comparisons when searching for a pattern in a text. Arguments: pattern (str): The pattern to search for. Returns: None """ # Initialize the list of longest proper prefix which is also a suffix self.lps = [0] * self.pattern_length # Loop over the pattern i = 1 # denotes the index of the character in the pattern j = 0 # denotes the length of the longest proper prefix which is also a suffix of the pattern[:i] while i < self.pattern_length: # Check if the characters match if self.pattern[i] == self.pattern[j]: j += 1 self.lps[i] = j i += 1 else: if j != 0: j = self.lps[j - 1] else: self.lps[i] = 0 i += 1
# Search for the pattern in the text
[docs] def search(self, pattern: str, text: str, ) -> int: """ This function searches for the pattern in the text. Arguments: pattern (str): The pattern to search for. text (str): The text to search in. Returns: int: The index of the pattern in the text (or -1 if the pattern is not found) Raises: AssertionError: If the text is not a string. .. note:: * This is the main function of the KMP search algorithm class. """ # Check the inputs assert isinstance(text, str), 'The text must be a string.' # Set the attributes self.pattern = pattern self.pattern_length = len(self.pattern) # Initialize the lps array self.initialize_lps() # Loop over the text i = 0 j = 0 while i < len(text): # Check if the characters match if self.pattern[j] == text[i]: i += 1 j += 1 # Check if the pattern is found if j == self.pattern_length: return i - j # Check if the characters do not match elif i < len(text) and self.pattern[j] != text[i]: if j != 0: j = self.lps[j - 1] else: i += 1 # Return -1 if the pattern is not found return -1
# Boyer-Moore search algorithm class
[docs]class BoyerMooreSearch: """ This class contains the Boyer-Moore search algorithm. """
[docs] def __init__(self) -> None: """ This function initializes the Boyer-Moore search algorithm class. [BM1977]_ The Bayer-Moore search algorithm is a string searching algorithm that uses a heuristic to skip over large sections of the search string, resulting in faster search times than traditional algorithms such as brute-force or Knuth-Morris-Pratt. It is particularly useful for searching for patterns in large amounts of text. .. [BM1977] Boyer, RS and Moore, JS. "A fast string searching algorithm." Communications of the ACM 20.10 (1977): 762-772. A Correct Preprocessing Algorithm for Boyer–Moore String-Searching https://www.cs.jhu.edu/~langmea/resources/lecture_notes/strings_matching_boyer_moore.pdf """ super().__init__()
# This is what we call the "prefix - suffix" match case of the good suffix rule
[docs] def aux_get_suffix_prefix_length(self, i: int, ) -> int: """ This auxiliary function is used to compute the length of the longest suffix of pattern[i:] that matches a "prefix" of the pattern. Arguments: i (int): The index of the suffix. Returns: int: The length of the longest suffix of pattern[i:] that matches a "prefix" of the pattern. """ # pattern [ ....... i ................j] # Initialize j to the end of the pattern j = self.pattern_length - 1 # pattern [ ....... i ....... j .......] # Move j to the left until we find a mismatch or until j == i while j >= i and self.pattern[j] == self.pattern[j - i]: # pattern [ ... j-i ..... i ... j .......] j -= 1 return self.pattern_length - (j - 1)
# This is what we call the "substring match" case of the good suffix rule
[docs] def aux_get_matching_substring_length(self, j: int, ) -> int: """ This auxilary function is used to compute the length of the longess suffix of the patterm that matches a substring of the pattern that ends at the index j. It is used in the "substring match" case of the good suffix rule. More specifically, it is used to find when the suffix of the pattern does not match the text at all. Hence, we find the longest suffix of the pattern that matches a substring of the pattern that ends at the index j. Arguments: j (int): The end index of the substring. Returns: int: The length of the longess suffix of the patterm that matches a substring of the pattern that ends at the index j. """ # Loop over the suffixes of the pattern for i in range(j, -1, -1): # Check if the substring matches the suffix if self.pattern[i:i+(j+1)] == self.pattern[self.pattern_length-(j+1):]: return j - i + 1 # Otherwise, if we get here, the substring does not match any suffix of the pattern return 0
# Creates the "good suffix" skip table
[docs] def create_skip_gs(self) -> None: """ This function creates the "good suffix" skip table. (It is used in the preprocessing step of the Boyer-Moore search algorithm.) Arguments: None Returns: None """ # Create the good suffix "skip" table # TODO(msuzgun): Has an error! self.skip_gs = [0] * self.pattern_length # skip_gs[i] denotes the number of cells to the right we need to skip if the current character is the i-th character of the pattern # First, we compute the length of the longest suffix of pattern [i:] that matches a prefix of the pattern for i in range(self.pattern_length - 1): self.skip_gs[i] = self.aux_get_suffix_prefix_length(i) # Set the default skip value to the pattern length self.skip_gs[-1] = 1 # Second, we compute the length of the longest suffix of the pattern that matches a substring of the pattern that ends at the index j for j in range(self.pattern_length - 2): k = (self.pattern_length - 1) - self.aux_get_matching_substring_length(j) if self.skip_gs[k] == 0: self.skip_gs[k] = self.pattern_length - 1 - j
# Creates the "bad character" skip table
[docs] def create_skip_bc(self) -> None: """ This function creates the "bad character" skip table. (It is used in the preprocessing step of the Boyer-Moore search algorithm.) Arguments: None Returns: None """ # Create the bad character "skip" table self.last_occurence = {} # last_occurence[c] denotes the index of the last occurence of the character c in the pattern for j in range(self.pattern_length - 1): self.last_occurence[self.pattern[j]] = j # Set the default skip value to the pattern length self.last_occurence.setdefault(None, self.pattern_length)
# Searches for the pattern in the text using the Boyer-Moore algorithm
[docs] def search(self, pattern: str, text: str, ) -> int: """ This function searches for the pattern in the text using the Boyer-Moore algorithm. Arguments: pattern (str): The pattern to search for. text (str): The text to search in. Returns: int: The index of the pattern in the text (or -1 if the pattern is not found) Raises: AssertionError: If the text or the pattern is not a string. """ # Check both the pattern and the text assert isinstance(pattern, str), 'The pattern must be a string.' assert isinstance(text, str), 'The text must be a string.' # Set the attributes self.pattern = pattern # Length of the pattern self.pattern_length = len(self.pattern) # Preprocess the pattern by creating the skip tables for the bad character and good suffix rules, respectively. self.create_skip_bc() self.create_skip_gs() # Loop over the text i = 0 while i <= len(text) - self.pattern_length: # Loop over the pattern j = self.pattern_length - 1 while j >= 0 and text[i + j] == self.pattern[j]: j -= 1 # Check if the pattern is found if j < 0: return i # Update i i += max(j - self.last_occurence.get(text[i + j], self.pattern_length), 1) # Return -1 if the pattern is not found return -1
# import time # import random # import string # for _ in range(100): # # Create a Boyer-Moore object # # Generate a random string of length (1, 10) # pattern = ''.join(random.choice(['A', 'C', 'G', 'T']) for _ in range(random.randint(1, 5))) # # pattern = 'AA' # bm = BoyerMooreSearch(pattern) # # Create a Naive object # naive = NaiveSearch() # # Create a Rabin-Karp object # rk = RabinKarpSearch() #pattern) # # Create a KMP object # kmp = KMPSearch() # # Search for the pattern in the text # # Generate a random string of length (11, 100) # text = ''.join(random.choice(['A', 'C', 'G', 'T']) for _ in range(random.randint(11, 10000))) # # text = 'AAAAAAAAA' # # # Measure time # # start = time.time() # # print(bm.search(text)) # # end = time.time() # # # print(end - start) # # start = time.time() # # rk_result = rk.search(text) # # end = time.time() # # # print(end - start) # # start = time.time() # # naive_result = naive.search(text) # # end = time.time() # # print(end - start) # naive_result = naive.search(pattern=pattern, text=text) # rk_result = rk.search(pattern=pattern, text=text) # kmp_result = kmp.search(pattern=pattern, text=text) # bm_result = bm.search(text) # print(naive_result, rk_result, bm_result, kmp_result) # # Check if the results are the same # assert naive_result == rk_result == bm_result == kmp_result, 'The results are not the same!'