Source code for string2string.similarity.classical

"""
This module contains the classes for the similarity metrics and functions.
"""


from typing import List, Union, Tuple, Optional
import numpy as np

# # Import the LongestCommonSubsequence class
from string2string.alignment.classical import LongestCommonSubsequence, LongestCommonSubstring

# Longest Common Subsequence based similarity class
[docs]class LCSubsequenceSimilarity(LongestCommonSubsequence): """ This class contains the Longest Common Subsequence similarity metric. This class inherits from the LongestCommonSubsequence class. """
[docs] def __init__(self): super().__init__()
[docs] def compute(self, str1: Union[str, List[str]], str2: Union[str, List[str]], denominator: str = 'max', ) -> float: """ Returns the LCS-similarity between two strings. Arguments: str1 (Union[str, List[str]]): The first string or list of strings. str2 (Union[str, List[str]]): The second string or list of strings. denominator (str): The denominator to use. Options are 'max' and 'sum'. Default is 'max'. Returns: float: The similarity between the two strings. Raises: ValueError: If the denominator is invalid. """ # Get the numerator numerator, _ = super().compute(str1, str2) if denominator == 'max': return (numerator / max(len(str1), len(str2))) elif denominator == 'sum': return (2. * numerator / (len(str1) + len(str2))) else: raise ValueError('Invalid denominator.')
# Longest Common Substring based similarity class
[docs]class LCSubstringSimilarity(LongestCommonSubstring): """ This class contains the Longest Common Substring similarity metric. This class inherits from the LongestCommonSubstring class. """
[docs] def __init__(self): super().__init__()
[docs] def compute(self, str1: Union[str, List[str]], str2: Union[str, List[str]], denominator: str = 'max', ) -> float: """ Returns the LCS-similarity between two strings. Arguments: str1 (Union[str, List[str]]): The first string or list of strings. str2 (Union[str, List[str]]): The second string or list of strings. denominator (str): The denominator to use. Options are 'max' and 'sum'. Default is 'max'. Returns: float: The similarity between the two strings. Raises: ValueError: If the denominator is invalid. """ # Get the numerator numerator, _ = super().compute(str1, str2) if denominator == 'max': return (numerator / max(len(str1), len(str2))) elif denominator == 'sum': return (2. * numerator / (len(str1) + len(str2))) else: raise ValueError('Invalid denominator.')
# Jaro similarity class
[docs]class JaroSimilarity: """ This class contains the Jaro similarity metric. """
[docs] def __init__(self): pass
[docs] def compute(self, str1: Union[str, List[str]], str2: Union[str, List[str]], ) -> float: """ This function returns the Jaro similarity between two strings. Arguments: str1 (Union[str, List[str]]): The first string or list of strings. str2 (Union[str, List[str]]): The second string or list of strings. Returns: float: The Jaro similarity between the two strings. """ # Get the length of the strings len1 = len(str1) len2 = len(str2) # Get the maximum distance, which we denote by k k = max(len1, len2) // 2 - 1 # Initialize the number of matching characters and the number of transpositions num_matches = 0 num_transpositions = 0 # Initialize the list of matching flags for the strings matches1 = [False] * len1 matches2 = [False] * len2 # Loop through the characters in the first string and find the matching characters for i in range(len1): # Get the lower and upper bounds for the search lower_bound = max(0, i - k) upper_bound = min(len2, i + k + 1) # Loop through the characters in the second string for j in range(lower_bound, upper_bound): # Check if the characters match if not matches2[j] and str1[i] == str2[j]: # Increment the number of matches num_matches += 1 # Set the matching flags matches1[i] = True matches2[j] = True # Break out of the loop break # Check if there are no matches if num_matches == 0: return 0. # Loop through again but this time find the number of transpositions # That is, the number of times where there are two matching characters but there is another "matched" character in between them moving_index = 0 for i in range(len1): # Check if the character is a match if matches1[i]: # Find the next match for j in range(moving_index, len2): # Check if the character is a match if matches2[j]: # Set the moving index moving_index = j + 1 # Check if the characters are not in the right order if str1[i] != str2[j]: # Increment the number of transpositions num_transpositions += 1 # Break out of the loop break num_transpositions = num_transpositions // 2 # Return the Jaro similarity return (num_matches / len1 + num_matches / len2 + (num_matches - num_transpositions) / num_matches) / 3.0