Source code for finds.unstructured.vocab

"""Class to manage words vocabulary

Copyright 2022, Terence Lim

MIT License
"""
from typing import Dict, Iterable, List, Any, Tuple, Self, Set
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from pandas.api import types
import random
import os
import json
import pickle
import nltk
from nltk.tokenize import RegexpTokenizer
from pathlib import Path
_VERBOSE = 0

[docs]class Vocab(): """Class for managing a vocabulary of words Args: words : List of words to create index unk : str representation of unknown word """ tokenize = RegexpTokenizer(r"\b[^\d\W][^\d\W][^\d\W]+\b").tokenize """a default tokenizer, wraps nltk RegexpTokenizer""" def __init__(self, words: List = [], unk: str = '<UNK>'): """Initialize class for managing words vocabulary Examples: >>> Vocab(['hello', 'world']) """ # create bidirectional mapping of words and indexes self.word2idx = {unk: 0} self.idx2word = {0: unk} self.unk = unk self.update(words) self.embeddings = []
[docs] def update(self, words: List): """update words in vocab, in lower case""" idx = len(self.word2idx) for w in words: w = w.lower() if w not in self.word2idx: self.word2idx[w] = idx self.idx2word[idx] = w idx += 1
@property def dim(self) -> int: """returns the dimensionality of the embeddings vector""" return self.embeddings.shape[1]
[docs] def dump(self, filename: str) -> Self: """Dump vocab to file""" with open(filename, "wb") as f: pickle.dump([self.word2idx, self.idx2word, self.embeddings], f)
[docs] def load(self, filename: str) -> Self: """Load vocab from file""" with open(filename, "rb") as f: self.word2idx, self.idx2word, self.embeddings = pickle.load(f)
[docs] def __getitem__(self, item: str | int) -> int | str: """Return index of str item or word of int item""" if isinstance(item, str): return self.word2idx.get(item.lower(), 0) elif isinstance(item, int): return self.idx2word.get(item, self.unk) else: raise Exception("item must be str or int")
[docs] def get_index(self, words: str | List) -> int | List: """Return indexes of words list, optionally drop unknown words""" return ([self.get_index(w) for w in words] if types.is_list_like(words) else self[words])
[docs] def get_word(self, index: int | List) -> str | List: """Return words of indexes""" return ([self.get_word(k) for k in index] if types.is_list_like(index) else self[index])
def __contains__(self, word: str) -> bool: """Returns True (False) if word is in (not in) vocab""" return word in self.word2idx def __len__(self) -> int: """Returns length of vocab""" return len(self.word2idx)
[docs] def set_embeddings(self, embeddings: DataFrame) -> DataFrame: """Relativize and index embeddings to words in vocab""" # default embeddings vector values vectors = np.random.normal(scale=0.6, size=(len(self.word2idx), embeddings.shape[1])) vectors[0] = np.zeros((1, embeddings.shape[1])) # values for unknown word words = Series(self.word2idx) common = list(set(words.index).intersection(embeddings.index)\ .difference(['nan'])) vectors[words[common].values] = embeddings.loc[common].values self.embeddings = vectors
[docs] def get_embeddings(self, word: str | List) -> np.array: """Return embedding vector of a (list of) word""" return (np.vstack([self.get_embeddings(w) for w in word]) if types.is_list_like(word) else self.embeddings[self[word]])
if __name__ == "__main__": from collections import Counter from secret import paths text = ['The quick brown fox jumps over the lazy dog', 'The cow jumps over the moon'] lines = [Vocab.tokenize(line.lower()) for line in text] # Count words for vocab counts = Counter() for line in lines: counts.update(line) words = [w[0] for w in counts.most_common(5)] vocab = Vocab(words) # test it print(vocab['the'], vocab['unk']) print(vocab[2], vocab[0], vocab[1000]) print(vocab.get_index(lines)) print(vocab.get_words([1, 2, 3, 4, 5, 6, 7])) # Read word embeddings vectors as a DataFrame filename = paths['scratch'] / 'glove.6B.300d.txt' sep = " " quoting = 3 df = pd.read_csv(filename, sep=sep, quoting=quoting, header=None, index_col=0, low_memory=True) df.index = df.index.astype(str).str.lower() # convert to lower case # Relativize to vocab vocab.set_embeddings(df)