# Source code for docarray.array.mixins.text

from collections import Counter
from typing import Tuple, Dict


class TextToolsMixin:
    """Helper functions used in NLP for DA and DAM."""

    def get_vocabulary(
        self, min_freq: int = 1, text_attrs: Tuple[str, ...] = ('text',)
    ) -> Dict[str, int]:
        """Get the text vocabulary in a dict that maps from the word to the index from all Documents.

        Token counts are accumulated over every Document yielded by iterating ``self``;
        words whose total count is below ``min_freq`` are dropped. The remaining words
        are numbered consecutively, in the order they were first encountered.

        :param min_freq: the minimum word frequency to be considered into the vocabulary.
        :param text_attrs: the textual attributes where vocabulary will be derived from
        :return: a vocabulary in dictionary where key is the word, value is the index. The value is 2-index, where
            `0` is reserved for padding, `1` is reserved for unknown token.
        """
        # NOTE(review): each Document is expected to expose
        # ``get_vocabulary(text_attrs=...)`` returning something ``Counter.update``
        # accepts (presumably a token -> count mapping) -- confirm against Document.
        token_counts = Counter()
        for doc in self:
            token_counts.update(doc.get_vocabulary(text_attrs=text_attrs))

        # Counter keeps insertion order, so indices follow first appearance.
        frequent_tokens = (
            token for token, count in token_counts.items() if count >= min_freq
        )

        # 0 for padding, 1 for unknown -> real tokens start at index 2
        return {token: idx for idx, token in enumerate(frequent_tokens, start=2)}