Source code for string2string.similarity.bertscore

"""
    This module wraps the original implementation of the BERTScore algorithm by Zhang et al. (2020).

    BERTScore: Evaluating Text Generation with BERT

    @inproceedings{bertscore2020,
        title={BERTScore: Evaluating Text Generation with BERT},
        author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
        booktitle={International Conference on Learning Representations},
        year={2020},
        url={https://openreview.net/forum?id=SkeHuCVFDr}
    }

    Disclaimer:
        This code is adapted from https://github.com/Tiiiger/bert_score
"""

from typing import List, Union, Optional, Tuple

import os
import sys
import time
import pandas as pd
from collections import defaultdict
import torch
from bert_score.utils import (bert_cos_score_idf, get_hash, 
                              get_idf_dict, get_model, get_tokenizer,
                              lang2model, model2layers)


class BERTScore:
    """
    This class implements the BERTScore algorithm.
    """

    def __init__(
        self,
        model_name_or_path: Optional[str] = None,
        lang: Optional[str] = None,
        num_layers: Optional[int] = None,
        all_layers: bool = False,
        use_fast_tokenizer: bool = False,
        device: Optional[str] = 'cpu',
        baseline_path: Optional[str] = None,
    ) -> None:
        r"""
        This function initializes the BERTScore class, which computes the BERTScore between two texts.

        Arguments:
            model_name_or_path (str): BERT model type to use (e.g., bert-base-uncased).
            lang (str): Language of the texts (e.g., en).
            num_layers (int): Number of layers to use.
            all_layers (bool): Whether to use all layers.
            use_fast_tokenizer (bool): Whether to use the fast tokenizer.
            device (str): Device to use (e.g., cpu or cuda).
            baseline_path (str): Path to the baseline file.

        Returns:
            None

        Raises:
            ValueError: If model_name_or_path and lang are both None.

        .. attention::

            If you use this class, please make sure to cite the following paper:

            .. code-block:: latex

                @inproceedings{bertscore2020,
                    title={BERTScore: Evaluating Text Generation with BERT},
                    author={Tianyi Zhang* and Varsha Kishore* and Felix Wu* and Kilian Q. Weinberger and Yoav Artzi},
                    booktitle={International Conference on Learning Representations},
                    year={2020},
                    url={https://openreview.net/forum?id=SkeHuCVFDr}
                }

        .. note::
            * If model_name_or_path is not specified, use the default model for the language.
            * If num_layers is not specified, use the default number of layers.
            * If device is None, use the GPU if available, otherwise use the CPU (the default is 'cpu').
            * baseline_path is only needed when compute() is called with rescale_with_baseline=True.
        """
        # Check the arguments
        if model_name_or_path is None and lang is None:
            raise ValueError("You must specify either model_name_or_path or lang")

        # Set the attributes
        self.model_name_or_path = model_name_or_path
        self.lang = lang
        self.num_layers = num_layers
        self.all_layers = all_layers
        self.use_fast_tokenizer = use_fast_tokenizer
        self.baseline_path = baseline_path

        # If model_name_or_path is not specified, use the default model for the language
        if self.model_name_or_path is None:
            self.lang = lang.lower()
            self.model_name_or_path = lang2model[self.lang]

        # If num_layers is not specified, use the default number of layers
        if num_layers is None:
            self.num_layers = model2layers[self.model_name_or_path]

        # Set the device
        self.device = device
        if self.device is None:
            self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load model and tokenizer
        self.tokenizer = get_tokenizer(self.model_name_or_path, self.use_fast_tokenizer)
        self.model = get_model(self.model_name_or_path, self.num_layers, self.all_layers)
        self.model.eval()
        # Move the model to the *resolved* device so that it matches the device
        # handed to the scorer in compute() (the raw argument may be None).
        self.model.to(self.device)

    # Compute the BERTScore between source sentences and target sentences
    def compute(
        self,
        source_sentences: List[str],
        target_sentences: Union[List[str], List[List[str]]],
        batch_size: int = 4,
        idf: Union[bool, dict] = False,
        nthreads: int = 4,
        return_hash: bool = False,
        rescale_with_baseline: bool = False,
        verbose: bool = False,
    ) -> Union[dict, Tuple[dict, str]]:
        """
        This function scores the source sentences based on their similarity to the target sentences using BERTScore.

        Arguments:
            source_sentences (list of str): candidate sentences
            target_sentences (list of str or list of list of str): reference sentences
            batch_size (int): bert score processing batch size
            idf (bool or dict): use idf weighting, can also be a precomputed idf_dict
            nthreads (int): number of threads
            return_hash (bool): return hashcode of the setting
            rescale_with_baseline (bool): rescale bertscore with pre-computed baseline
            verbose (bool): turn on intermediate status update

        Returns:
            (Dict[str, numpy.ndarray], Optional[str]): A dictionary with the keys "precision", "recall",
            and "f1", each holding a NumPy array of scores. If return_hash is True, a tuple of that
            dictionary and the hashcode of the setting is returned instead. When several references are
            given for a source sentence, the best score over its references is reported.

        Raises:
            ValueError: If the number of source sentences and target sentences do not match, if no
                sentences are given, or if rescale_with_baseline is True but no baseline_path was set.
        """
        # Check the arguments
        if len(source_sentences) != len(target_sentences):
            raise ValueError("The number of candidates and references do not match")
        if len(source_sentences) == 0:
            # Fail with a clear message instead of an IndexError on target_sentences[0] below.
            raise ValueError("At least one source sentence and one target sentence are required")
        if rescale_with_baseline and self.baseline_path is None:
            raise ValueError("Need to specify baseline_path when rescaling with baseline")

        # If the target sentences are grouped, flatten them and remember where
        # each group starts and ends so the scores can be reduced per group later.
        ref_group_boundaries = None
        if not isinstance(target_sentences[0], str):
            ref_group_boundaries = []
            ori_source_sentences, ori_target_sentences = source_sentences, target_sentences
            source_sentences, target_sentences = [], []
            count = 0
            for cand, ref_group in zip(ori_source_sentences, ori_target_sentences):
                source_sentences += [cand] * len(ref_group)
                target_sentences += ref_group
                ref_group_boundaries.append((count, count + len(ref_group)))
                count += len(ref_group)

        # Get the IDF dict
        if not idf:
            idf_dict = defaultdict(lambda: 1.0)
            # set idf for [SEP] and [CLS] to 0
            idf_dict[self.tokenizer.sep_token_id] = 0
            idf_dict[self.tokenizer.cls_token_id] = 0
        elif isinstance(idf, dict):
            if verbose:
                print("using predefined IDF dict...")
            idf_dict = idf
        else:
            if verbose:
                print("preparing IDF dict...")
            start = time.perf_counter()
            idf_dict = get_idf_dict(target_sentences, self.tokenizer, nthreads=nthreads)
            if verbose:
                print("done in {:.2f} seconds".format(time.perf_counter() - start))

        if verbose:
            print("calculating scores...")
        start = time.perf_counter()

        # Get all the predictions
        all_preds = bert_cos_score_idf(
            model=self.model,
            refs=target_sentences,
            hyps=source_sentences,
            tokenizer=self.tokenizer,
            idf_dict=idf_dict,
            verbose=verbose,
            device=self.device,
            batch_size=batch_size,
            all_layers=self.all_layers,
        ).cpu()

        # If the target sentences are grouped, take the max score
        if ref_group_boundaries is not None:
            max_preds = [
                all_preds[beg:end].max(dim=0)[0] for beg, end in ref_group_boundaries
            ]
            all_preds = torch.stack(max_preds, dim=0)

        # Rescale with baseline (baseline_path is guaranteed to be set here by
        # the argument check at the top of this function).
        use_custom_baseline = self.baseline_path is not None
        if rescale_with_baseline:
            if os.path.isfile(self.baseline_path):
                if not self.all_layers:
                    # One row per layer; the first column is skipped.
                    baselines = torch.from_numpy(
                        pd.read_csv(self.baseline_path).iloc[self.num_layers].to_numpy()
                    )[1:].float()
                else:
                    baselines = (
                        torch.from_numpy(pd.read_csv(self.baseline_path).to_numpy())[:, 1:]
                        .unsqueeze(1)
                        .float()
                    )
                all_preds = (all_preds - baselines) / (1 - baselines)
            else:
                # Deliberately best-effort: scores are returned un-rescaled.
                print(
                    f"Warning: Baseline not Found for {self.model_name_or_path} on {self.lang} at {self.baseline_path}",
                    file=sys.stderr,
                )

        # Get the final output; the last dimension holds (P, R, F)
        scores = {
            "precision": all_preds[..., 0].numpy(),
            "recall": all_preds[..., 1].numpy(),
            "f1": all_preds[..., 2].numpy(),
        }

        # Print the time
        if verbose:
            time_diff = time.perf_counter() - start
            print(
                f"done in {time_diff:.2f} seconds, {len(target_sentences) / time_diff:.2f} sentences/sec"
            )

        # If return hash, return both the output and the hash
        if return_hash:
            return (
                scores,
                get_hash(
                    self.model_name_or_path,
                    self.num_layers,
                    idf,
                    rescale_with_baseline,
                    use_custom_baseline=use_custom_baseline,
                    use_fast_tokenizer=self.use_fast_tokenizer,
                ),
            )

        # Otherwise, just return the output
        return scores