Source code for string2string.search.classical

"""
    This module contains the following algorithms:
        (-) Naive search algorithm ++
        (a) Rabin-Karp algorithm ++ 
        (b) Boyer-Moore algorithm ++ 
        (c) Knuth-Morris-Pratt algorithm
        (d) Suffix Tree algorithm
        (e) Suffix Array algorithm
        (f) Suffix Automaton algorithm
        (g) Aho-Corasick algorithm (basis of fgrep/grep in Unix) ++ (not implemented)
        (h) Ukkonen's algorithm -- (not implemented)
        (i) Wu-Manber algorithm ++ (not implemented)
        (j) Z-Algorithm ++ (not implemented)
"""

from typing import List, Union, Tuple, Optional
from string2string.misc.hash_functions import HashFunction, PolynomialRollingHash


# Parent class for all search algorithms
class SearchAlgorithm:
    """
    This class contains the parent class for all search algorithms.
    """

    def __init__(self) -> None:
        """
        This function initializes the abstract class for all search algorithms.

        Returns:
            None
        """
        pass

    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        Searches for the pattern in a text.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text.
        """
        pass


[docs]class NaiveSearch(SearchAlgorithm):
    """
    This class contains the naive search algorithm.
    """

[docs]    def __init__(self) -> None:
        """
        Initializes the class.

        Returns:
            None
        """
        super().__init__()


[docs]    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        Searches for the pattern in the text.

        Arguments:
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).

        Raises:
            AssertionError: If the inputs are invalid.
        """
        # Check the inputs
        assert isinstance(pattern, str), 'The pattern must be a string.'
        assert isinstance(text, str), 'The text must be a string.'

        # Set the attributes
        self.pattern = pattern
        self.pattern_length = len(self.pattern)

        # Loop over the text
        for i in range(len(text) - self.pattern_length + 1):
            # Check if the strings match
            if text[i:i + self.pattern_length] == self.pattern:
                return i
            
        # Return -1 if the pattern is not found
        return -1


# Rabin-Karp search algorithm class
[docs]class RabinKarpSearch(SearchAlgorithm):
    """
    This class contains the Rabin-Karp search algorithm.
    """

[docs]    def __init__(self,
        hash_function: HashFunction = PolynomialRollingHash(),
    ) -> None:
        """
        This function initializes the Rabin-Karp search algorithm class, which uses a hash function to search for a pattern in a text. [RK1987]_

        Arguments:
            hash_function (HashFunction): The hash function to use.

        Returns:
            None

        Raises:
            AssertionError: If the inputs are invalid. 

        .. [RK1987] Karp, R.M. and Rabin, M.O., 1987. Efficient randomized pattern-matching algorithms. IBM Journal of Research and Development, 31(2), pp.249-260.
        """
        assert isinstance(hash_function, HashFunction), 'The hash function must be a HashFunction object.'

        # Set the attributes
        # self.pattern = pattern
        self.hash_function = hash_function

        # # Compute the hash value of the pattern
        # self.pattern_hash = self.hash_function.compute(self.pattern)

        # # Length of the pattern
        # self.pattern_length = len(self.pattern)

[docs]    def itialize_pattern_hash(self,
        pattern: str,
    ) -> None:
        """
        This function initializes the pattern hash value.

        Arguments:
            pattern (str): The pattern to search for.

        Returns:
            None

        Raises:
            AssertionError: If the inputs are invalid.
        """
        # Check the inputs
        assert isinstance(pattern, str), 'The pattern must be a string.'

        # Reset the hash function
        self.hash_function.reset()

        # Set the attributes
        self.pattern = pattern

        # Compute the hash value of the pattern
        self.pattern_hash = self.hash_function.compute(self.pattern)

        # Length of the pattern
        self.pattern_length = len(self.pattern)


[docs]    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        This function searches for the pattern in the text.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found).

        Raises:
            AssertionError: If the inputs are invalid.

        
        """
        # Check the inputs
        assert isinstance(text, str), 'The text must be a string.'

        # Initialize the pattern hash
        self.itialize_pattern_hash(pattern)

        # Reset the hash function (in case it was used before) [Important!]
        self.hash_function.reset()

        # Compute the hash value of the first window
        window_hash = self.hash_function.compute(text[:self.pattern_length])

        # Loop over the text
        for i in range(len(text) - self.pattern_length + 1):
            # print('Window hash: {}'.format(window_hash))

            # Check if the hash values match
            if window_hash == self.pattern_hash:
                # print('Hash values match at index {}.'.format(i))
                j = 0
                # Check if the strings match
                while text[i + j] == self.pattern[j]:
                    j += 1
                    if j == self.pattern_length:
                        return i
            # Update the hash value of the window
            if i < len(text) - self.pattern_length:
                window_hash = self.hash_function.update(text[i], text[i + self.pattern_length], self.pattern_length)

        # Return -1 if the pattern is not found
        return -1
    

# Knuth-Morris-Pratt (KMP) search algorithm class
[docs]class KMPSearch(SearchAlgorithm):
    """
    This class contains the KMP search algorithm.
    """
    
[docs]    def __init__(self) -> None:
       r"""
        This function initializes the Knuth-Morris-Pratt (KMP) search algorithm class. [KMP1977]_

        Arguments:
            None

        Returns:
            None

        .. note::
            * The current version of the KMP algorithm utilizes an auxiliary list called the lps_array, which stands for "longest proper prefix which is also a suffix". The lps_array is a list of integers where lps_array[i] represents the length of the longest proper prefix of the pattern that is also a suffix of the pattern[:i+1].
            * By precomputing the lps_array, the KMP algorithm avoids unnecessary character comparisons while searching for the pattern in the text. The algorithm scans the text from left to right and compares characters in the pattern with characters in the text. When a mismatch occurs, the algorithm uses the values in the lps_array to determine the next character in the pattern to compare with the text.
            * An alternative implementation of the KMP algorithm exists, which uses a finite state automaton (FSA) instead of the lps_array, but this is not implemented in this version of the package.

        .. [KMP1977] Knuth, D.E., Morris, J.H. and Pratt, V.R., 1977. Fast pattern matching in strings. SIAM journal on computing, 6(2), pp.323-350.
        """
       super().__init__()

    # Initialize_lps function
[docs]    def initialize_lps(self) -> None:
        r"""
        This function initializes the pongest proper prefix suffix (lps) array, which contains the length of the longest proper prefix that is also a suffix of the pattern.

        IOW: For each index i in the lps array, lps[i] is the length of the longest proper prefix that is also a suffix of the pattern[:i + 1]. In other words, if k = lps[i], then pattern[:k] is equal to pattern[i - k + 1:i + 1] (with the condition that pattern[:k+1] is not equal to pattern[i - k:i + 1]). The lps array is used in the Knuth-Morris-Pratt (KMP) algorithm to avoid unnecessary comparisons when searching for a pattern in a text.

        Arguments:
            pattern (str): The pattern to search for.

        Returns:
            None
        """
        # Initialize the list of longest proper prefix which is also a suffix
        self.lps = [0] * self.pattern_length

        # Loop over the pattern
        i = 1 # denotes the index of the character in the pattern
        j = 0 # denotes the length of the longest proper prefix which is also a suffix of the pattern[:i]
        while i < self.pattern_length:
            # Check if the characters match
            if self.pattern[i] == self.pattern[j]:
                j += 1
                self.lps[i] = j
                i += 1
            else:
                if j != 0: 
                    j = self.lps[j - 1]
                else:
                    self.lps[i] = 0
                    i += 1

    # Search for the pattern in the text
[docs]    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        This function searches for the pattern in the text.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found)

        Raises:
            AssertionError: If the text is not a string.

        .. note::
            * This is the main function of the KMP search algorithm class.
        """
        # Check the inputs
        assert isinstance(text, str), 'The text must be a string.'

        # Set the attributes
        self.pattern = pattern
        self.pattern_length = len(self.pattern)

        # Initialize the lps array
        self.initialize_lps()

        # Loop over the text
        i = 0
        j = 0
        while i < len(text):
            # Check if the characters match
            if self.pattern[j] == text[i]:
                i += 1
                j += 1
            # Check if the pattern is found
            if j == self.pattern_length:
                return i - j
            # Check if the characters do not match
            elif i < len(text) and self.pattern[j] != text[i]:
                if j != 0:
                    j = self.lps[j - 1]
                else:
                    i += 1

        # Return -1 if the pattern is not found
        return -1
    


# Boyer-Moore search algorithm class
[docs]class BoyerMooreSearch:
    """
    This class contains the Boyer-Moore search algorithm.
    """

[docs]    def __init__(self) -> None:
        """
        This function initializes the Boyer-Moore search algorithm class. [BM1977]_

        The Bayer-Moore search algorithm is a string searching algorithm that uses a heuristic to skip over large sections of the search string, resulting in faster search times than traditional algorithms such as brute-force or Knuth-Morris-Pratt. It is particularly useful for searching for patterns in large amounts of text.

        .. [BM1977] Boyer, RS and Moore, JS. "A fast string searching algorithm." Communications of the ACM 20.10 (1977): 762-772.
            
        A Correct Preprocessing Algorithm for Boyer–Moore String-Searching

        https://www.cs.jhu.edu/~langmea/resources/lecture_notes/strings_matching_boyer_moore.pdf

        """
        super().__init__()

    
    # This is what we call the "prefix - suffix" match case of the good suffix rule
[docs]    def aux_get_suffix_prefix_length(self,
        i: int,
    ) -> int:
        """
        This auxiliary function is used to compute the length of the longest suffix of pattern[i:] that matches a "prefix" of the pattern.

        Arguments:
            i (int): The index of the suffix.

        Returns:
            int: The length of the longest suffix of pattern[i:] that matches a "prefix" of the pattern.
        """
        
        # pattern [ ....... i ................j]
        # Initialize j to the end of the pattern
        j = self.pattern_length - 1
        
        # pattern [ ....... i ....... j .......]
        # Move j to the left until we find a mismatch or until j == i
        while j >= i and self.pattern[j] == self.pattern[j - i]:
            # pattern [ ... j-i ..... i ... j .......]
            j -= 1
        
        return self.pattern_length - (j - 1)
    

    # This is what we call the "substring match" case of the good suffix rule
[docs]    def aux_get_matching_substring_length(self,
        j: int,
    ) -> int:
        """
        This auxilary function is used to compute the length of the longess suffix of the patterm that matches a substring of the pattern that ends at the index j. 

        It is used in the "substring match" case of the good suffix rule. More specifically, it is used to find when the suffix of the pattern does not match the text at all. Hence, we find the longest suffix of the pattern that matches a substring of the pattern that ends at the index j.

        Arguments:
            j (int): The end index of the substring.

        Returns:
            int: The length of the longess suffix of the patterm that matches a substring of the pattern that ends at the index j.

        """
        # Loop over the suffixes of the pattern
        for i in range(j, -1, -1):
            # Check if the substring matches the suffix
            if self.pattern[i:i+(j+1)] == self.pattern[self.pattern_length-(j+1):]:
                return j - i + 1
        # Otherwise, if we get here, the substring does not match any suffix of the pattern
        return 0
    

    # Creates the "good suffix" skip table
[docs]    def create_skip_gs(self) -> None:
        """
        This function creates the "good suffix" skip table. (It is used in the preprocessing step of the Boyer-Moore search algorithm.)

        Arguments:
            None

        Returns:
            None

        """
        # Create the good suffix "skip" table
        # TODO(msuzgun): Has an error!
        self.skip_gs = [0] * self.pattern_length
        # skip_gs[i] denotes the number of cells to the right we need to skip if the current character is the i-th character of the pattern
        
        # First, we compute the length of the longest suffix of pattern [i:] that matches a prefix of the pattern
        for i in range(self.pattern_length - 1):
            self.skip_gs[i] = self.aux_get_suffix_prefix_length(i)

        # Set the default skip value to the pattern length
        self.skip_gs[-1] = 1

        # Second, we compute the length of the longest suffix of the pattern that matches a substring of the pattern that ends at the index j
        for j in range(self.pattern_length - 2):
            k = (self.pattern_length - 1) - self.aux_get_matching_substring_length(j)
            if self.skip_gs[k] == 0:
                self.skip_gs[k] = self.pattern_length - 1 - j


    # Creates the "bad character" skip table
[docs]    def create_skip_bc(self) -> None:
        """
        This function creates the "bad character" skip table. (It is used in the preprocessing step of the Boyer-Moore search algorithm.)

        Arguments:
            None

        Returns:
            None
        """
        # Create the bad character "skip" table
        self.last_occurence = {}

        # last_occurence[c] denotes the index of the last occurence of the character c in the pattern
        for j in range(self.pattern_length - 1):
            self.last_occurence[self.pattern[j]] = j

        # Set the default skip value to the pattern length
        self.last_occurence.setdefault(None, self.pattern_length)

    
    # Searches for the pattern in the text using the Boyer-Moore algorithm
[docs]    def search(self,
        pattern: str,
        text: str,
    ) -> int:
        """
        This function searches for the pattern in the text using the Boyer-Moore algorithm.

        Arguments:
            pattern (str): The pattern to search for.
            text (str): The text to search in.

        Returns:
            int: The index of the pattern in the text (or -1 if the pattern is not found)

        Raises:
            AssertionError: If the text or the pattern is not a string.
        """
        # Check both the pattern and the text
        assert isinstance(pattern, str), 'The pattern must be a string.'
        assert isinstance(text, str), 'The text must be a string.'

        # Set the attributes
        self.pattern = pattern

        # Length of the pattern
        self.pattern_length = len(self.pattern)

        # Preprocess the pattern by creating the skip tables for the bad character and good suffix rules, respectively.
        self.create_skip_bc()
        self.create_skip_gs()


        # Loop over the text
        i = 0
        while i <= len(text) - self.pattern_length:
            # Loop over the pattern
            j = self.pattern_length - 1
            while j >= 0 and text[i + j] == self.pattern[j]:
                j -= 1
            # Check if the pattern is found
            if j < 0:
                return i
            # Update i
            i += max(j - self.last_occurence.get(text[i + j], self.pattern_length), 1)

        # Return -1 if the pattern is not found
        return -1



# import time
# import random
# import string

# for _ in range(100):

#     # Create a Boyer-Moore object
#     # Generate a random string of length (1, 10)
#     pattern = ''.join(random.choice(['A', 'C', 'G', 'T']) for _ in range(random.randint(1, 5)))
#     # pattern = 'AA'
#     bm = BoyerMooreSearch(pattern)

#     # Create a Naive object
#     naive = NaiveSearch()

#     # Create a Rabin-Karp object
#     rk = RabinKarpSearch() #pattern)

#     # Create a KMP object
#     kmp = KMPSearch()

#     # Search for the pattern in the text
#     # Generate a random string of length (11, 100)
#     text = ''.join(random.choice(['A', 'C', 'G', 'T']) for _ in range(random.randint(11, 10000)))
#     # text = 'AAAAAAAAA'

#     # # Measure time
#     # start = time.time()
#     # print(bm.search(text))
#     # end = time.time()
#     # # print(end - start)

#     # start = time.time()
#     # rk_result = rk.search(text)
#     # end = time.time()
#     # # print(end - start)

#     # start = time.time()
#     # naive_result = naive.search(text)
#     # end = time.time()
#     # print(end - start)
#     naive_result = naive.search(pattern=pattern, text=text)
#     rk_result = rk.search(pattern=pattern, text=text)
#     kmp_result = kmp.search(pattern=pattern, text=text)
#     bm_result = bm.search(text)

#     print(naive_result, rk_result, bm_result, kmp_result)

#     # Check if the results are the same
#     assert naive_result == rk_result == bm_result == kmp_result, 'The results are not the same!'