"""
This module contains the classes for the similarity metrics and functions.
"""
from typing import List, Union, Tuple, Optional
import numpy as np
# # Import the LongestCommonSubsequence class
from string2string.alignment.classical import LongestCommonSubsequence, LongestCommonSubstring
# Longest Common Subsequence based similarity class
[docs]class LCSubsequenceSimilarity(LongestCommonSubsequence):
"""
This class contains the Longest Common Subsequence similarity metric.
This class inherits from the LongestCommonSubsequence class.
"""
[docs] def __init__(self):
super().__init__()
[docs] def compute(self,
str1: Union[str, List[str]],
str2: Union[str, List[str]],
denominator: str = 'max',
) -> float:
"""
Returns the LCS-similarity between two strings.
Arguments:
str1 (Union[str, List[str]]): The first string or list of strings.
str2 (Union[str, List[str]]): The second string or list of strings.
denominator (str): The denominator to use. Options are 'max' and 'sum'. Default is 'max'.
Returns:
float: The similarity between the two strings.
Raises:
ValueError: If the denominator is invalid.
"""
# Get the numerator
numerator, _ = super().compute(str1, str2)
if denominator == 'max':
return (numerator / max(len(str1), len(str2)))
elif denominator == 'sum':
return (2. * numerator / (len(str1) + len(str2)))
else:
raise ValueError('Invalid denominator.')
# Longest Common Substring based similarity class
[docs]class LCSubstringSimilarity(LongestCommonSubstring):
"""
This class contains the Longest Common Substring similarity metric.
This class inherits from the LongestCommonSubstring class.
"""
[docs] def __init__(self):
super().__init__()
[docs] def compute(self,
str1: Union[str, List[str]],
str2: Union[str, List[str]],
denominator: str = 'max',
) -> float:
"""
Returns the LCS-similarity between two strings.
Arguments:
str1 (Union[str, List[str]]): The first string or list of strings.
str2 (Union[str, List[str]]): The second string or list of strings.
denominator (str): The denominator to use. Options are 'max' and 'sum'. Default is 'max'.
Returns:
float: The similarity between the two strings.
Raises:
ValueError: If the denominator is invalid.
"""
# Get the numerator
numerator, _ = super().compute(str1, str2)
if denominator == 'max':
return (numerator / max(len(str1), len(str2)))
elif denominator == 'sum':
return (2. * numerator / (len(str1) + len(str2)))
else:
raise ValueError('Invalid denominator.')
# Jaro similarity class
[docs]class JaroSimilarity:
"""
This class contains the Jaro similarity metric.
"""
[docs] def __init__(self):
pass
[docs] def compute(self,
str1: Union[str, List[str]],
str2: Union[str, List[str]],
) -> float:
"""
This function returns the Jaro similarity between two strings.
Arguments:
str1 (Union[str, List[str]]): The first string or list of strings.
str2 (Union[str, List[str]]): The second string or list of strings.
Returns:
float: The Jaro similarity between the two strings.
"""
# Get the length of the strings
len1 = len(str1)
len2 = len(str2)
# Get the maximum distance, which we denote by k
k = max(len1, len2) // 2 - 1
# Initialize the number of matching characters and the number of transpositions
num_matches = 0
num_transpositions = 0
# Initialize the list of matching flags for the strings
matches1 = [False] * len1
matches2 = [False] * len2
# Loop through the characters in the first string and find the matching characters
for i in range(len1):
# Get the lower and upper bounds for the search
lower_bound = max(0, i - k)
upper_bound = min(len2, i + k + 1)
# Loop through the characters in the second string
for j in range(lower_bound, upper_bound):
# Check if the characters match
if not matches2[j] and str1[i] == str2[j]:
# Increment the number of matches
num_matches += 1
# Set the matching flags
matches1[i] = True
matches2[j] = True
# Break out of the loop
break
# Check if there are no matches
if num_matches == 0:
return 0.
# Loop through again but this time find the number of transpositions
# That is, the number of times where there are two matching characters but there is another "matched" character in between them
moving_index = 0
for i in range(len1):
# Check if the character is a match
if matches1[i]:
# Find the next match
for j in range(moving_index, len2):
# Check if the character is a match
if matches2[j]:
# Set the moving index
moving_index = j + 1
# Check if the characters are not in the right order
if str1[i] != str2[j]:
# Increment the number of transpositions
num_transpositions += 1
# Break out of the loop
break
num_transpositions = num_transpositions // 2
# Return the Jaro similarity
return (num_matches / len1 + num_matches / len2 + (num_matches - num_transpositions) / num_matches) / 3.0