Source code for string2string.misc.word_embeddings

"""
This module implements the word embeddings class.
"""
# from tqdm import tqdm
import numpy as np
from typing import List, Union
import torch
import os
from torch import Tensor
from torch.nn import functional as F
import fasttext
import fasttext.util
from string2string.misc.default_tokenizer import Tokenizer


class NeuralEmbeddings:
    """
    This class is an abstract class for neural word embeddings.

    The lookup methods read two attributes that this base class does not
    create itself: ``vocabulary_dict`` (a mapping from token to row index) and
    ``embedding_layer`` (a callable applied to a tensor of row indices).
    They must be set on the instance before it is called.
    """

    def __init__(self,
        tokenizer: "Tokenizer" = None,
    ) -> None:
        """
        Constructor.

        Arguments:
            tokenizer (Tokenizer): The tokenizer to use. If None, a
                ``Tokenizer(word_delimiter=" ")`` is created.
        """
        # Set the tokenizer: keep the caller's tokenizer when one is given,
        # otherwise fall back to the default one.
        if tokenizer is None:
            tokenizer = Tokenizer(word_delimiter=" ")
        self.tokenizer = tokenizer

    def __call__(self,
        tokens: Union[List[str], str],
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed. A string is
                first split into tokens with ``self.tokenizer``.

        Returns:
            Tensor: The embeddings of the given tokens (one row per token).

        Raises:
            KeyError: If a token is not in ``self.vocabulary_dict``.
        """
        # Check the tokens
        if isinstance(tokens, str):
            tokens = self.tokenizer.tokenize(tokens)

        # Look up the row index of every token. The dtype is explicit so that
        # an empty token list still yields an integer index tensor (torch
        # would otherwise infer float, which the embedding layer rejects).
        indices = torch.tensor(
            [self.vocabulary_dict[token] for token in tokens],
            dtype=torch.long,
        )

        # Embed the tokens
        return self.embedding_layer(indices)

    def get_embedding(self,
        tokens: Union[List[str], str]
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        It is an alias of calling the instance directly.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)


# GloVe embeddings class
class GloVeEmbeddings(NeuralEmbeddings):
    """
    This class implements the GloVe word embeddings.
    """
    # Pre-trained GloVe embeddings
    # Source: https://github.com/stanfordnlp/GloVe#download-pre-trained-word-vectors
    MODEL_OPTIONS = {
        'glove.6B.200d': {
            'Description': 'Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip',
        },
        'glove.twitter.27B': {
            'Description': 'Twitter (27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.twitter.27B.zip',
        },
        'glove.42B.300d': {
            'Description': 'Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip',
        },
        'glove.840B.300d': {
            'Description': 'Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip',
        },
    }

    def __init__(self,
        model: str = 'glove.6B.200D',
        dim: int = 50,
        force_download: bool = False,
        dir: str = None,
        tokenizer: Tokenizer = None,
    ) -> None:
        r"""
        This function initializes the GloVe embeddings class.

        Arguments:
            model (str): The model to use. Default is 'glove.6B.200D'. (Options are: 'glove.6B.200d', 'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'; the name is matched case-insensitively.)
            dim (int): The dimension of the embeddings. Default is 50. It must be one of the vector sizes contained in the archive of the chosen model.
            force_download (bool): Whether to force download the model. Default is False.
            dir (str): The directory to save or load the model. Default is None.
            tokenizer (Tokenizer): The tokenizer to use. Default is None.

        Returns:
            None

        Raises:
            ValueError: If the model is not in the MODEL_OPTIONS ['glove.6B.200d', 'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'].

        .. attention::

            If you use this class, please make sure to cite the following paper:

            .. code-block:: latex

                @inproceedings{pennington2014glove,
                    title={Glove: Global vectors for word representation},
                    author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
                    booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
                    pages={1532--1543},
                    year={2014}
                }

        .. note::
            * If directory is None, the model will be saved in the torch hub directory.
            * If the model directory does not exist, the model will be downloaded automatically.
        """
        # Check model. Names are resolved case-insensitively so that the
        # default value 'glove.6B.200D' maps onto the 'glove.6B.200d' key.
        canonical_names = {name.lower(): name for name in self.MODEL_OPTIONS}
        resolved = canonical_names.get(model.lower()) if isinstance(model, str) else None
        if resolved is None:
            raise ValueError(f'Invalid model: {model}.')

        # Set the attributes
        self.model = resolved
        self.force_download = force_download
        self.dir = dir
        self.token_size = self.model.split('.')[1]
        self.dim = dim

        # Set the path
        if self.dir is None:
            self.dir = f'{torch.hub.get_dir()}/{self.model}'

        # Remove the trailing slash
        if self.dir.endswith('/'):
            self.dir = self.dir[:-1]

        # Download the embeddings if they do not exist or if force_download is True
        if not os.path.exists(self.dir) or self.force_download:
            # Local import: only needed on the download path
            import zipfile

            # Create the directory (and missing parents) if it does not exist
            os.makedirs(self.dir, exist_ok=True)

            # Download the glove .zip file
            zip_path = f'{self.dir}/glove.zip'
            print(f'Downloading the {self.model} zip file...')
            torch.hub.download_url_to_file(
                url=self.MODEL_OPTIONS[self.model]['URL'],
                dst=zip_path,
            )

            # Unzip the glove .txt files
            print(f'Unzipping the {self.model} zip file...')
            with zipfile.ZipFile(zip_path) as archive:
                archive.extractall(self.dir)

            # Delete the zip file
            os.remove(zip_path)

            # Process each glove .txt file and save it as a .pt file.
            # Anything else in the directory (e.g. .pt files left over from a
            # previous download) is skipped.
            for file in os.listdir(self.dir):
                if not file.endswith('.txt'):
                    continue
                print(f'Processing {file}...')
                self._convert_txt_to_pt(file)
                # Delete the glove .txt file once it has been converted
                os.remove(f'{self.dir}/{file}')

        # Load the weights and the vocabulary
        stem = f'{self.dir}/{self._file_prefix(self.model)}.{self.dim}d'
        weights = torch.load(f'{stem}.embeddings.pt')
        vocabulary = torch.load(f'{stem}.words.pt')

        # Create the vocabulary dictionary to be fed to the embedding layer
        self.vocabulary_dict = {word: i for i, word in enumerate(vocabulary)}

        # Create the embedding layer
        self.embedding_layer = torch.nn.Embedding.from_pretrained(
            embeddings=weights,
            freeze=True,
        )

        # Set the tokenizer
        if tokenizer is None:
            self.tokenizer = Tokenizer()
        else:
            self.tokenizer = tokenizer

    @staticmethod
    def _file_prefix(model: str) -> str:
        """
        Return the part of a model name shared by all of its vector files.

        A trailing '<number>d' component is dropped, so 'glove.6B.200d' gives
        'glove.6B' while 'glove.twitter.27B' is returned unchanged.
        """
        head, _, tail = model.rpartition('.')
        if tail.endswith('d') and tail[:-1].isdigit():
            return head
        return model

    def _convert_txt_to_pt(self, file: str) -> None:
        """
        Convert one glove .txt file in ``self.dir`` into two .pt files.

        Example of a glove .txt file:
            the 0.418 0.24968 -0.41242 0.1217 ...
            ...
            and 0.26818 0.14346 -0.27877 0.016257 ...

        The words are saved as '<name>.words.pt' and the vectors as
        '<name>.embeddings.pt', where <name> is the file name without '.txt'.
        """
        name = file[:-len('.txt')]

        # Extract the dimension of the embeddings from the last component of
        # the file name (e.g. glove.6B.200d.txt -> 200,
        # glove.twitter.27B.25d.txt -> 25)
        file_embed_dim = int(name.rsplit('.', 1)[-1][:-1])

        # Load the file
        with open(f'{self.dir}/{file}', 'r', encoding='utf-8') as f:
            lines = f.readlines()

        # Extract the words and the embeddings
        words = []
        embeddings = np.zeros((len(lines), file_embed_dim))
        for i, line in enumerate(lines):
            fields = line.rstrip().split(' ')
            # The vector is the last `file_embed_dim` fields; everything before
            # it is the token, which may itself contain spaces.
            words.append(' '.join(fields[:-file_embed_dim]))
            embeddings[i] = np.array([float(x) for x in fields[-file_embed_dim:]])

        # Convert the embeddings to a tensor
        embeddings = torch.from_numpy(embeddings)

        # Save the words and the embeddings as a .pt file
        torch.save(words, f'{self.dir}/{name}.words.pt')
        torch.save(embeddings, f'{self.dir}/{name}.embeddings.pt')

    def __call__(self,
        tokens: Union[List[str], str],
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return super().__call__(tokens)

    def get_embedding(self,
        tokens: Union[List[str], str]
        ) -> Tensor:
        r"""
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)
# FastTextEmbeddings class
class FastTextEmbeddings(NeuralEmbeddings):
    """
    This class implements the FastText embeddings.
    """

    def __init__(self,
        model: str = 'cc.en.300.bin',
        force_download: bool = True,
        dir: str = None,
        tokenizer: Tokenizer = None,
    ) -> None:
        r"""
        This function initializes the FastTextEmbeddings class.

        Arguments:
            model (str): The model to use. Some of the available models are:

                - 'cc.en.300.bin': The English model trained on Common Crawl (300 dimensions)
                - 'cc.hi.300.bin': The Hindi model trained on Common Crawl (300 dimensions)
                - 'cc.fr.300.bin': The French model trained on Common Crawl (300 dimensions)
                - 'cc.yi.300.bin': The Yiddish model trained on Common Crawl (300 dimensions)
                - ...
                - 'wiki.en': The English model trained on Wikipedia (300 dimensions)
                - 'wiki.simple': The Simple English model trained on Wikipedia (300 dimensions)
                - 'wiki.ar': The Arabic model trained on Wikipedia (300 dimensions)
                - 'wiki.bg': The Bulgarian model trained on Wikipedia (300 dimensions)
                - 'wiki.ca': The Catalan model trained on Wikipedia (300 dimensions)
                - 'wiki.zh': The Chinese model trained on Wikipedia (300 dimensions)
                - 'wiki.sw': The Swahili model trained on Wikipedia (300 dimensions)
                - 'wiki.fr': The French model trained on Wikipedia (300 dimensions)
                - 'wiki.de': The German model trained on Wikipedia (300 dimensions)
                - 'wiki.es': The Spanish model trained on Wikipedia (300 dimensions)
                - 'wiki.it': The Italian model trained on Wikipedia (300 dimensions)
                - 'wiki.pt': The Portuguese model trained on Wikipedia (300 dimensions)
                - 'wiki.ru': The Russian model trained on Wikipedia (300 dimensions)
                - 'wiki.tr': The Turkish model trained on Wikipedia (300 dimensions)
                - 'wiki.uk': The Ukrainian model trained on Wikipedia (300 dimensions)
                - 'wiki.vi': The Vietnamese model trained on Wikipedia (300 dimensions)
                - 'wiki.id': The Indonesian model trained on Wikipedia (300 dimensions)
                - 'wiki.ja': The Japanese model trained on Wikipedia (300 dimensions)
                - ...

            force_download (bool): Whether to force the download of the model. Default: True (the model is downloaded again on every instantiation unless this is set to False).
            dir (str): The directory to save and load the model.
            tokenizer (Tokenizer): The tokenizer to use. Default is None, in which case a default ``Tokenizer()`` is created.

        Returns:
            None

        Raises:
            FileNotFoundError: If force_download is False, the directory exists, and the converted '.words.pt' / '.embeddings.pt' files are not in it.

        .. attention::

            If you make use of this code, please cite the following papers (depending on the model you use):

            .. code-block:: latex

                @inproceedings{mikolov2018advances,
                    title={Advances in Pre-Training Distributed Word Representations},
                    author={Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
                    booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
                    year={2018}
                }

            .. code-block:: latex

                @article{bojanowski2017enriching,
                    title={Enriching Word Vectors with Subword Information},
                    author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
                    journal={Transactions of the Association for Computational Linguistics},
                    volume={5},
                    year={2017},
                    issn={2307-387X},
                    pages={135--146}
                }

            .. code-block:: latex

                @article{joulin2016fasttext,
                    title={FastText.zip: Compressing text classification models},
                    author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
                    journal={arXiv preprint arXiv:1612.03651},
                    year={2016}
                }

        .. note::

            * The models are downloaded from https://fasttext.cc/docs/en/english-vectors.html.
            * The models are saved in the torch hub directory, if no directory is specified.
        """
        # Set the attributes
        self.model = model
        self.dir = dir
        self.force_download = force_download

        # Set the path
        if self.dir is None:
            # For convenience, we save the model in the torch hub directory
            self.dir = f'{torch.hub.get_dir()}/{self.model}'

        # Remove the trailing slash
        if self.dir.endswith('/'):
            self.dir = self.dir[:-1]

        # Download the embeddings if they do not exist or if force_download is True
        if not os.path.exists(self.dir) or self.force_download:
            # Local imports: only needed on the download path
            import gzip
            import shutil
            import zipfile

            # Create the directory (and missing parents) if it does not exist
            os.makedirs(self.dir, exist_ok=True)

            if 'wiki' in model:
                # https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
                archive_path = f'{self.dir}/{model}.zip'
                torch.hub.download_url_to_file(
                    url=f'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/{model}.zip',
                    dst=archive_path,
                )
                with zipfile.ZipFile(archive_path) as archive:
                    archive.extractall(self.dir)
                os.remove(archive_path)

                # NOTE(review): the wiki archives are expected to contain
                # '<model>.bin' next to '<model>.vec' -- confirm. Fall back to
                # the bare model name if no '.bin' file was extracted.
                model_path = f'{self.dir}/{model}.bin'
                if not os.path.exists(model_path):
                    model_path = f'{self.dir}/{model}'
            else:
                # https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
                archive_path = f'{self.dir}/{model}.gz'
                model_path = f'{self.dir}/{model}'
                torch.hub.download_url_to_file(
                    url=f'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{model}.gz',
                    dst=archive_path,
                )
                # Decompress in chunks so the whole model is never held in memory
                with gzip.open(archive_path, 'rb') as src, open(model_path, 'wb') as dst:
                    shutil.copyfileobj(src, dst)
                os.remove(archive_path)

            # Load the model
            ft = fasttext.load_model(model_path)

            # Get the vocabulary
            words = ft.get_words()

            # Convert the embeddings to a tensor
            embeddings = torch.tensor(ft.get_input_matrix())

            # Save the words and the embeddings as a .pt file
            torch.save(words, f'{self.dir}/{model}.words.pt')
            torch.save(embeddings, f'{self.dir}/{model}.embeddings.pt')

            # Delete the model
            del ft
        else:
            try:
                # Load the words and the embeddings
                words = torch.load(f'{self.dir}/{model}.words.pt')
                embeddings = torch.load(f'{self.dir}/{model}.embeddings.pt')
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    f'Please install the {model} model first by setting force_download to True.'
                ) from err

        # Create the vocabulary dictionary to be fed to the embedding layer
        self.vocabulary_dict = {word: i for i, word in enumerate(words)}

        # Create the embedding layer
        self.embedding_layer = torch.nn.Embedding.from_pretrained(
            embeddings=embeddings,
            freeze=True,
        )

        # Set the tokenizer (needed when the instance is called with a string)
        if tokenizer is None:
            self.tokenizer = Tokenizer()
        else:
            self.tokenizer = tokenizer

    def __call__(self,
        tokens: Union[List[str], str],
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return super().__call__(tokens)

    def get_embedding(self,
        tokens: Union[List[str], str]
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)