# Source code for finds.unstructured.vocab

"""Class to manage words vocabulary

Copyright 2022, Terence Lim

MIT License
"""
from typing import Dict, Iterable, List, Any, Tuple, Self, Set
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from pandas.api import types
import random
import os
import json
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer
from pathlib import Path
# Module-level verbosity setting (0 presumably means quiet -- TODO confirm);
# it is not referenced by any of the code shown in this module.
_VERBOSE = 0

class Vocab:
    """Manage a bidirectional mapping between lower-cased words and indexes

    Index 0 is always reserved for the unknown-word token, so looking up a
    word that is not in the vocabulary resolves to index 0, and looking up
    an index that is not in the vocabulary resolves to the unknown token.

    Args:
      words : List of words to create index
      unk : str representation of unknown word

    Attributes:
      word2idx : dict mapping each word to its integer index
      idx2word : dict mapping each integer index back to its word
      unk : str representation of unknown word
      embeddings : matrix with one row per vocab index once
        set_embeddings() has been called; an empty list before that
    """

    # pattern matches runs of three or more word characters, none a digit
    tokenize = RegexpTokenizer(r"\b[^\d\W][^\d\W][^\d\W]+\b").tokenize
    """a default tokenizer, wraps nltk RegexpTokenizer"""

    def __init__(self, words: List | None = None, unk: str = '<UNK>'):
        """Initialize class for managing words vocabulary

        Args:
          words : List of words to add (in lower case) after the unknown
            token; omit or pass None to start with only the unknown token
          unk : str representation of unknown word, stored at index 0

        Examples:
        >>> Vocab(['hello', 'world'])
        """

        # create bidirectional mapping of words and indexes
        self.word2idx = {unk: 0}
        self.idx2word = {0: unk}
        self.unk = unk
        if words is not None:    # None sentinel avoids a shared mutable default
            self.update(words)
        self.embeddings = []

    def update(self, words: List) -> None:
        """update words in vocab, in lower case

        Words already present keep their index; new words are appended with
        consecutive indexes.  The embeddings matrix is not resized here, so
        call set_embeddings() again after adding words.

        Args:
          words : List of words to add
        """
        idx = len(self.word2idx)
        for w in words:
            w = w.lower()
            if w not in self.word2idx:
                self.word2idx[w] = idx
                self.idx2word[idx] = w
                idx += 1

    @property
    def dim(self) -> int:
        """returns the dimensionality of the embeddings vector

        Returns 0 while no embeddings matrix has been set.
        """
        embeddings = np.asarray(self.embeddings)
        return embeddings.shape[1] if embeddings.ndim == 2 else 0

    def dump(self, filename: str) -> Self:
        """Dump vocab to file

        Args:
          filename : path of the pickle file to write

        Returns:
          This Vocab instance, to allow chaining
        """
        with open(filename, "wb") as f:
            pickle.dump([self.word2idx, self.idx2word, self.embeddings], f)
        return self

    def load(self, filename: str) -> Self:
        """Load vocab from file

        Args:
          filename : path of a pickle file previously written by dump()

        Returns:
          This Vocab instance, to allow chaining
        """
        # NOTE: unpickling can execute arbitrary code -- only load files
        # that come from a trusted source
        with open(filename, "rb") as f:
            self.word2idx, self.idx2word, self.embeddings = pickle.load(f)
        # keep the unknown token consistent with the mapping just loaded
        self.unk = self.idx2word.get(0, self.unk)
        return self

    def __getitem__(self, item: str | int) -> int | str:
        """Return index of str item or word of int item

        Args:
          item : word (matched in lower case) or integer index

        Returns:
          Index of the word (0 if unknown), or word at the index (the
          unknown token if the index is not in the vocab)

        Raises:
          TypeError : if item is neither a str nor an integer
        """
        if isinstance(item, str):
            return self.word2idx.get(item.lower(), 0)
        # accept numpy integers too, e.g. when indexes come from an ndarray
        if isinstance(item, (int, np.integer)):
            return self.idx2word.get(item, self.unk)
        raise TypeError("item must be str or int")

    def get_index(self, words: str | List) -> int | List:
        """Return index of a word, or (nested) list of indexes of a words list

        Unknown words map to index 0.
        """
        return ([self.get_index(w) for w in words] if types.is_list_like(words)
                else self[words])

    def get_word(self, index: int | List) -> str | List:
        """Return word of an index, or (nested) list of words of an index list

        Unknown indexes map to the unknown token.
        """
        return ([self.get_word(k) for k in index] if types.is_list_like(index)
                else self[index])

    def __contains__(self, word: str) -> bool:
        """Returns True (False) if word is in (not in) vocab

        The exact string is tried first (so the unknown token itself is
        found), then its lower-cased form, matching how words are stored.
        """
        return word in self.word2idx or (
            isinstance(word, str) and word.lower() in self.word2idx)

    def __len__(self) -> int:
        """Returns length of vocab"""
        return len(self.word2idx)

    def set_embeddings(self, embeddings: DataFrame) -> None:
        """Relativize and index embeddings to words in vocab

        Builds a matrix with one row per vocab index and stores it in
        self.embeddings.  Rows of words found in the index of the input are
        copied from it; other rows keep random normal values, and row 0
        (unknown word) is zeros unless the unknown token itself is found.

        Args:
          embeddings : DataFrame of embedding vectors, indexed by word
        """
        dim = embeddings.shape[1]

        # default embeddings vector values
        vectors = np.random.normal(scale=0.6, size=(len(self.word2idx), dim))
        vectors[0] = 0.0    # values for unknown word

        words = Series(self.word2idx)
        # 'nan' is excluded: it is what a missing label becomes when an
        # index is cast to str
        common = list(set(words.index).intersection(embeddings.index)
                      .difference(['nan']))
        rows = embeddings.loc[common]
        # a duplicated label returns several rows; keep the first of each
        # so that rows and target indexes stay aligned
        rows = rows[~rows.index.duplicated(keep='first')]
        vectors[words.loc[rows.index].to_numpy()] = rows.to_numpy()
        self.embeddings = vectors

    def get_embeddings(self, word: str | List) -> np.ndarray:
        """Return embedding vector of a (list of) word

        Args:
          word : a word, or a list of words

        Returns:
          1-D vector for a single word, or 2-D array with one row per word
          for a list (zero rows for an empty list)
        """
        if not types.is_list_like(word):
            return self.embeddings[self[word]]
        rows = [self.get_embeddings(w) for w in word]
        if not rows:    # np.vstack raises on an empty sequence
            return np.empty((0, self.dim))
        return np.vstack(rows)


if __name__ == "__main__":
    # Demo: build a small vocab from two sentences, exercise the lookups,
    # then attach pre-trained GloVe vectors read from a local file.
    import csv
    from collections import Counter
    # project-local module; presumably maps names to pathlib.Path folders
    from secret import paths

    text = ['The quick brown fox jumps over the lazy dog',
            'The cow jumps over the moon']
    lines = [Vocab.tokenize(line.lower()) for line in text]

    # Count words for vocab
    counts = Counter()
    for line in lines:
        counts.update(line)
    words = [w[0] for w in counts.most_common(5)]
    vocab = Vocab(words)

    # test it
    print(vocab['the'], vocab['unk'])
    print(vocab[2], vocab[0], vocab[1000])
    print(vocab.get_index(lines))
    print(vocab.get_word([1, 2, 3, 4, 5, 6, 7]))

    # Read word embeddings vectors as a DataFrame
    filename = paths['scratch'] / 'glove.6B.300d.txt'
    # QUOTE_NONE (== 3): quote characters are treated as ordinary text
    df = pd.read_csv(filename, sep=" ", quoting=csv.QUOTE_NONE,
                     header=None, index_col=0, low_memory=True)
    df.index = df.index.astype(str).str.lower()   # convert to lower case

    # Relativize to vocab
    vocab.set_embeddings(df)