# Source code for string2string.misc.word_embeddings

"""
This module implements the word embeddings class.
"""
# from tqdm import tqdm
import gzip
import os
import shutil
import zipfile
from typing import List, Union

import fasttext
import fasttext.util
import numpy as np
import torch
from torch import Tensor
from torch.nn import functional as F

from string2string.misc.default_tokenizer import Tokenizer


class NeuralEmbeddings:
    """
    This class is an abstract base class for neural word embeddings.

    The lookup implemented in ``__call__`` reads two attributes that this base
    class does not create itself: ``self.vocabulary_dict`` (a mapping from a
    token to its row index) and ``self.embedding_layer`` (a callable embedding
    layer). They must be set before the object is called.
    """

    def __init__(self,
        tokenizer: Union["Tokenizer", None] = None,
    ) -> None:
        """
        Constructor.

        Arguments:
            tokenizer (Tokenizer): The tokenizer to use. If None, a default
                ``Tokenizer(word_delimiter=" ")`` is created.
        """
        # Always store a tokenizer. The previous version only assigned the
        # attribute when no tokenizer was given, so a caller-provided tokenizer
        # was silently discarded and `self.tokenizer` was left undefined.
        if tokenizer is None:
            tokenizer = Tokenizer(word_delimiter=" ")
        self.tokenizer = tokenizer


    def __call__(self,
        tokens: Union[List[str], str],
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed. A string is first
                split into tokens with ``self.tokenizer.tokenize``.

        Returns:
            Tensor: The embeddings of the given tokens (one row per token).

        Raises:
            KeyError: If a token is not in ``self.vocabulary_dict``.
        """
        # Tokenize raw text; a list is assumed to be tokenized already
        if isinstance(tokens, str):
            tokens = self.tokenizer.tokenize(tokens)

        # Map each token to its row index in the embedding matrix
        indices = [self.vocabulary_dict[token] for token in tokens]

        # The explicit integer dtype matters for an empty token list: without it,
        # torch.tensor([]) is a float tensor, which the embedding layer rejects.
        return self.embedding_layer(torch.tensor(indices, dtype=torch.long))


    def get_embedding(self,
        tokens: Union[List[str], str]
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        It is a named alias of calling the object directly.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)


# GloVe embeddings class
class GloVeEmbeddings(NeuralEmbeddings):
    """
    This class implements the GloVe word embeddings.
    """
    # Pre-trained GloVe embeddings
    # Source: https://github.com/stanfordnlp/GloVe#download-pre-trained-word-vectors
    MODEL_OPTIONS = {
        'glove.6B.200d': {
            'Description': 'Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 300d vectors, 822 MB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.6B.zip',
            },
        'glove.twitter.27B': {
            'Description': 'Twitter (27B tokens, 1.2M vocab, uncased, 200d vectors, 1.42 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.twitter.27B.zip',
        },
        'glove.42B.300d': {
            'Description': 'Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.42B.300d.zip',
        },
        'glove.840B.300d': {
            'Description': 'Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download)',
            'URL': 'https://huggingface.co/stanfordnlp/glove/resolve/main/glove.840B.300d.zip',
        },
    }

    def __init__(self, 
        model: str = 'glove.6B.200D',
        dim: int = 50,
        force_download: bool = False,
        dir = None,
        tokenizer: Tokenizer = None,
        ) -> None:
        r"""
        This function initializes the GloVe embeddings class.

        Arguments:
            model (str): The model to use. Default is 'glove.6B.200D'. The name is matched
                case-insensitively against MODEL_OPTIONS. (Options are: 'glove.6B.200d',
                'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'.)
            dim (int): The dimension of the embeddings to load. Default is 50. The value must
                match one of the files shipped in the chosen archive (presumably 50/100/200/300
                for 'glove.6B' and only 300 for the Common Crawl models -- confirm against the
                GloVe download page).
            force_download (bool): Whether to force download the model. Default is False.
            dir (str): The directory to save or load the model. Default is None.
            tokenizer (Tokenizer): The tokenizer to use. Default is None, in which case
                ``Tokenizer()`` is used.

        Returns:
            None

        Raises:
            ValueError: If the model is not in the MODEL_OPTIONS ['glove.6B.200d', 'glove.twitter.27B', 'glove.42B.300d', 'glove.840B.300d'].
            FileNotFoundError: If no converted embeddings exist for the requested dimension.

        
        .. attention::

            If you use this class, please make sure to cite the following paper:
        
            .. code-block:: latex

                 @inproceedings{pennington2014glove,
                    title={Glove: Global vectors for word representation},
                    author={Pennington, Jeffrey and Socher, Richard and Manning, Christopher D},
                    booktitle={Proceedings of the 2014 conference on empirical methods in natural language processing (EMNLP)},
                    pages={1532--1543},
                    year={2014}
                }

        
        .. note::
            * If directory is None, the model will be saved in the torch hub directory.
            * If the directory does not exist (or force_download is True), the model is downloaded automatically.
        """
        # Check model. The lookup is case-insensitive because the default value
        # ('glove.6B.200D') differs in case from the MODEL_OPTIONS key
        # ('glove.6B.200d'); an exact match made the default constructor fail.
        canonical_names = {name.lower(): name for name in self.MODEL_OPTIONS}
        if not isinstance(model, str) or model.lower() not in canonical_names:
            raise ValueError(
                f'Invalid model: {model}. Available models: {list(self.MODEL_OPTIONS)}.'
            )

        # Set the attributes
        self.model = canonical_names[model.lower()]
        self.force_download = force_download
        self.dir = dir
        self.dim = dim

        # The corpus-size component of the model name (e.g. '6B', '27B', '840B').
        # Searching for it (instead of taking component 1) keeps it correct for
        # 'glove.twitter.27B', where component 1 is 'twitter'.
        name_parts = self.model.split('.')
        self.token_size = next(
            (part for part in name_parts if part.endswith('B') and part[:-1].isdigit()),
            name_parts[1],
        )

        # Set the path
        if self.dir is None:
            self.dir = f'{torch.hub.get_dir()}/{self.model}'

        # Remove the trailing slash
        if self.dir.endswith('/'):
            self.dir = self.dir[:-1]

        # Download the embeddings if they do not exist or if force_download is True
        if not os.path.exists(self.dir) or self.force_download:
            self._download_and_convert()

        # Load the weights and the vocabulary
        file_stem = os.path.join(self.dir, self._file_stem())
        weights_path = f'{file_stem}.embeddings.pt'
        words_path = f'{file_stem}.words.pt'
        if not (os.path.exists(weights_path) and os.path.exists(words_path)):
            available = sorted(
                name for name in os.listdir(self.dir) if name.endswith('.embeddings.pt')
            )
            raise FileNotFoundError(
                f'No {self.dim}-dimensional embeddings for {self.model} in {self.dir} '
                f'(found: {available}). Choose a matching dim or set force_download to True.'
            )
        weights = torch.load(weights_path)
        vocabulary = torch.load(words_path)

        # Create the vocabulary dictionary to be fed to the embedding layer
        self.vocabulary_dict = {word: i for i, word in enumerate(vocabulary)}

        # Create the embedding layer
        self.embedding_layer = torch.nn.Embedding.from_pretrained(
            embeddings=weights,
            freeze=True,
        )

        # Set the tokenizer
        if tokenizer is None:
            self.tokenizer = Tokenizer()
        else:
            self.tokenizer = tokenizer


    def _file_stem(self) -> str:
        """
        Return the base name (without suffix) of the files for ``self.dim``,
        e.g. 'glove.6B.50d' or 'glove.twitter.27B.50d'.
        """
        parts = self.model.split('.')
        # Some model names end with a dimension component ('200d', '300d') that
        # is not part of the per-dimension file prefix; drop it if present.
        if parts[-1].endswith('d') and parts[-1][:-1].isdigit():
            parts = parts[:-1]
        return '.'.join(parts + [f'{self.dim}d'])


    def _download_and_convert(self) -> None:
        """
        Download the model archive into ``self.dir``, extract it and convert every
        extracted .txt file into a pair of .pt files (words and embeddings).
        """
        # Create the directory (and missing parents) if it does not exist
        os.makedirs(self.dir, exist_ok=True)

        # Download the glove .zip file
        zip_path = os.path.join(self.dir, 'glove.zip')
        print(f'Downloading the {self.model} zip file...')
        torch.hub.download_url_to_file(
            url=self.MODEL_OPTIONS[self.model]['URL'],
            dst=zip_path,
        )

        # Unzip the glove .txt files
        print(f'Unzipping the {self.model} zip file...')
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(self.dir)

        # Delete the zip file
        os.remove(zip_path)

        # Process each glove .txt file and save it as a .pt file. Only .txt files
        # are considered, so .pt files left over from an earlier run are ignored.
        for file in sorted(os.listdir(self.dir)):
            if not file.endswith('.txt'):
                continue
            print(f'Processing {file}...')
            txt_path = os.path.join(self.dir, file)
            self._convert_txt_to_pt(txt_path)

            # Delete the glove .txt file once it has been converted
            os.remove(txt_path)


    @staticmethod
    def _convert_txt_to_pt(txt_path: str) -> None:
        """
        Convert one glove .txt file into '<name>.words.pt' and '<name>.embeddings.pt'.

        Example of a glove .txt file:
            the 0.418 0.24968 -0.41242 0.1217 ...
            and 0.26818 0.14346 -0.27877 0.016257 ...
        """
        file_stem = txt_path[:-len('.txt')]

        # The dimension is the last name component before '.txt'
        # (e.g. glove.6B.200d.txt -> 200, glove.twitter.27B.25d.txt -> 25).
        embed_dim = int(os.path.basename(file_stem).split('.')[-1][:-1])

        # Load the file (skipping blank lines)
        with open(txt_path, 'r', encoding='utf-8') as f:
            lines = [line for line in f if line.strip()]

        # Extract the words and the embeddings
        words = []
        embeddings = np.zeros((len(lines), embed_dim))
        for i, line in enumerate(lines):
            fields = line.rstrip().split(' ')
            # Take the vector from the right so that a token which itself
            # contains spaces does not shift the numeric columns.
            words.append(' '.join(fields[:-embed_dim]))
            embeddings[i] = np.array([float(x) for x in fields[-embed_dim:]])

        # Save the words and the embeddings (as a tensor) as .pt files
        torch.save(words, f'{file_stem}.words.pt')
        torch.save(torch.from_numpy(embeddings), f'{file_stem}.embeddings.pt')


    def __call__(self,
        tokens: Union[List[str], str],
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return super().__call__(tokens)
    

    def get_embedding(self,
        tokens: Union[List[str], str]
        ) -> Tensor:
        r"""
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)


# FastTextEmbeddings class
class FastTextEmbeddings(NeuralEmbeddings):
    """
    This class implements the FastText embeddings.        
    """
    def __init__(self,
        model: str = 'cc.en.300.bin',
        force_download: bool = True,
        dir: str = None,
        tokenizer: Tokenizer = None,
        ) -> None:
        r"""
        This function initializes the FastTextEmbeddings class.

        Arguments:
            model (str): The model to use. Some of the available models are:

                - 'cc.en.300.bin': The English model trained on Common Crawl (300 dimensions)
                - 'cc.hi.300.bin': The Hindi model trained on Common Crawl (300 dimensions)
                - 'cc.fr.300.bin': The French model trained on Common Crawl (300 dimensions)
                - 'cc.yi.300.bin': The Yiddish model trained on Common Crawl (300 dimensions)
                -  ... 
                - 'wiki.en': The English model trained on Wikipedia (300 dimensions)
                - 'wiki.simple': The Simple English model trained on Wikipedia (300 dimensions)
                - 'wiki.ar': The Arabic model trained on Wikipedia (300 dimensions)
                - 'wiki.bg': The Bulgarian model trained on Wikipedia (300 dimensions)
                - 'wiki.ca': The Catalan model trained on Wikipedia (300 dimensions)
                - 'wiki.zh': The Chinese model trained on Wikipedia (300 dimensions)
                - 'wiki.sw': The Swahili model trained on Wikipedia (300 dimensions)
                - 'wiki.fr': The French model trained on Wikipedia (300 dimensions)
                - 'wiki.de': The German model trained on Wikipedia (300 dimensions)
                - 'wiki.es': The Spanish model trained on Wikipedia (300 dimensions)
                - 'wiki.it': The Italian model trained on Wikipedia (300 dimensions)
                - 'wiki.pt': The Portuguese model trained on Wikipedia (300 dimensions)
                - 'wiki.ru': The Russian model trained on Wikipedia (300 dimensions)
                - 'wiki.tr': The Turkish model trained on Wikipedia (300 dimensions)
                - 'wiki.uk': The Ukrainian model trained on Wikipedia (300 dimensions)
                - 'wiki.vi': The Vietnamese model trained on Wikipedia (300 dimensions)
                - 'wiki.id': The Indonesian model trained on Wikipedia (300 dimensions)
                - 'wiki.ja': The Japanese model trained on Wikipedia (300 dimensions)
                - ... 
            
            force_download (bool): Whether to force the download of the model. Default: True,
                i.e. the model is downloaded again on every construction unless False is passed.
            dir (str): The directory to save and load the model. 
            tokenizer (Tokenizer): The tokenizer to use. Default is None, in which case
                ``Tokenizer(word_delimiter=" ")`` is used.

        Returns:
            None

        Raises:
            FileNotFoundError: If force_download is False, the directory exists, and the
                converted words/embeddings files are not in it.

        .. attention::

            If you make use of this code, please cite the following papers (depending on the model you use):
    
            .. code-block:: latex 

                @inproceedings{mikolov2018advances,
                    title={Advances in Pre-Training Distributed Word Representations},
                    author={Mikolov, Tomas and Grave, Edouard and Bojanowski, Piotr and Puhrsch, Christian and Joulin, Armand},
                    booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
                    year={2018}
                }

            .. code-block:: latex 

                @article{bojanowski2017enriching,
                    title={Enriching Word Vectors with Subword Information},
                    author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
                    journal={Transactions of the Association for Computational Linguistics},
                    volume={5},
                    year={2017},
                    issn={2307-387X},
                    pages={135--146}
                }

            .. code-block:: latex 

                @article{joulin2016fasttext,
                    title={FastText.zip: Compressing text classification models},
                    author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, H{\'e}rve and Mikolov, Tomas},
                    journal={arXiv preprint arXiv:1612.03651},
                    year={2016}
                }

        .. note::

            * The models are downloaded from https://fasttext.cc/docs/en/english-vectors.html.
            * The models are saved in the torch hub directory, if no directory is specified.
        """ 

        # Set the tokenizer. The previous version never set one, so embedding a
        # raw string failed with an AttributeError. The base constructor installs
        # the default; an explicit tokenizer is assigned here directly.
        super().__init__()
        if tokenizer is not None:
            self.tokenizer = tokenizer

        # Set the attributes
        self.model = model
        self.dir = dir
        self.force_download = force_download

        # Set the path
        if self.dir is None:
            # For convenience, we save the model in the torch hub directory
            self.dir = f'{torch.hub.get_dir()}/{self.model}'

        # Remove the trailing slash
        if self.dir.endswith('/'):
            self.dir = self.dir[:-1]

        words_path = f'{self.dir}/{model}.words.pt'
        embeddings_path = f'{self.dir}/{model}.embeddings.pt'

        # Download the embeddings if they do not exist or if force_download is True
        if not os.path.exists(self.dir) or self.force_download:
            # Create the directory (and missing parents) if it does not exist
            os.makedirs(self.dir, exist_ok=True)

            # Download and unpack the binary model
            model_path = self._download_model()

            # Load the model
            ft = fasttext.load_model(model_path)

            # Get the vocabulary
            words = ft.get_words()

            # Convert the embeddings to a tensor
            # NOTE(review): the input matrix presumably also holds the subword
            # (n-gram bucket) rows after the word rows; only the rows indexed by
            # `words` are ever looked up here -- confirm whether slicing to
            # len(words) would be an acceptable memory saving.
            embeddings = torch.tensor(ft.get_input_matrix())

            # Save the words and the embeddings as a .pt file
            torch.save(words, words_path)
            torch.save(embeddings, embeddings_path)

            # Delete the model
            del ft

        else:
            try:
                # Load the words and the embeddings
                words = torch.load(words_path)
                embeddings = torch.load(embeddings_path)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    f'Please install the {model} model first by setting force_download to True.'
                ) from err

        # Create the vocabulary dictionary to be fed to the embedding layer
        self.vocabulary_dict = {word: i for i, word in enumerate(words)}

        # Create the embedding layer
        self.embedding_layer = torch.nn.Embedding.from_pretrained(
            embeddings=embeddings,
            freeze=True,
        )


    def _download_model(self) -> str:
        """
        Download the archive of ``self.model`` into ``self.dir``, unpack it, delete
        the archive and return the path of the binary model file.
        """
        if 'wiki' in self.model:
            # https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
            url = f'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/{self.model}.zip'
            archive_path = os.path.join(self.dir, f'{self.model}.zip')
            torch.hub.download_url_to_file(url=url, dst=archive_path)
            with zipfile.ZipFile(archive_path) as archive:
                archive.extractall(self.dir)
            os.remove(archive_path)
            # Assumes the archive contains '<model>.bin' next to the text
            # vectors -- TODO confirm against the fastText download page.
            return os.path.join(self.dir, f'{self.model}.bin')

        # https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
        url = f'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/{self.model}.gz'
        archive_path = os.path.join(self.dir, f'{self.model}.gz')
        model_path = os.path.join(self.dir, self.model)
        torch.hub.download_url_to_file(url=url, dst=archive_path)
        # Stream the decompression so the model is never held in memory at once
        with gzip.open(archive_path, 'rb') as compressed, open(model_path, 'wb') as target:
            shutil.copyfileobj(compressed, target)
        os.remove(archive_path)
        return model_path


    def __call__(self,
        tokens: Union[List[str], str],
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return super().__call__(tokens)
    

    def get_embedding(self,
        tokens: Union[List[str], str]
        ) -> Tensor:
        """
        This function returns the embeddings of the given tokens.

        Arguments:
            tokens (Union[List[str], str]): The tokens to embed.

        Returns:
            Tensor: The embeddings of the given tokens.
        """
        return self.__call__(tokens)