# Source code for docarray.array.mixins.text
from collections import Counter
from typing import Tuple, Dict
class TextToolsMixin:
    """Helper functions used in NLP for DA and DAM."""

    def get_vocabulary(
        self, min_freq: int = 1, text_attrs: Tuple[str, ...] = ('text',)
    ) -> Dict[str, int]:
        """Get the text vocabulary as a dict mapping each word to its index, built from all Documents.

        Token counts are gathered by calling ``get_vocabulary(text_attrs=...)`` on every
        element yielded when iterating over ``self`` and summing the results.

        :param min_freq: the minimum total frequency a word must reach to be included in the vocabulary.
        :param text_attrs: the textual attributes from which the vocabulary is derived.
        :return: a dictionary where the key is the word and the value is its index. Indices start at 2:
            `0` is reserved for padding, `1` is reserved for the unknown token. Words are numbered
            in the order they were first encountered.
        """
        all_tokens = Counter()
        for d in self:
            all_tokens.update(d.get_vocabulary(text_attrs=text_attrs))

        # Drop rare words first so the surviving words get contiguous indices.
        frequent_tokens = (k for k, v in all_tokens.items() if v >= min_freq)

        # 0 for padding, 1 for unknown
        return {k: idx for idx, k in enumerate(frequent_tokens, start=2)}