Source code for docarray.document.mixins.text

from collections import Counter
from typing import Tuple, Dict, Union, Optional, TYPE_CHECKING

import numpy as np

from docarray.document.mixins.helper import _uri_to_blob, _to_datauri

if TYPE_CHECKING:  # pragma: no cover
    from docarray.typing import T

class TextDataMixin:
    """Provide helper functions for :class:`Document` to support text data."""

    def load_uri_to_text(self: 'T', charset: str = 'utf-8', **kwargs) -> 'T':
        """Convert :attr:`.uri` to :attr:`.text` inplace.

        :param charset: charset may be any character set registered with IANA
        :param kwargs: keyword arguments to pass to :meth:`_uri_to_blob` such as timeout
        :return: itself after processed
        """
        blob = _uri_to_blob(self.uri, **kwargs)
        self.text = blob.decode(charset)
        return self

    def get_vocabulary(self, text_attrs: Tuple[str, ...] = ('text',)) -> Dict[str, int]:
        """Get the text vocabulary as a counter dict that maps each word to its
        frequency across all attributes named in ``text_attrs``.

        :param text_attrs: the textual attributes where vocabulary will be derived from
        :return: a vocabulary in dictionary where key is the word, value is the
            frequency of that word in all text fields.
        """
        all_tokens = Counter()
        for f in text_attrs:
            all_tokens.update(_text_to_word_sequence(getattr(self, f)))
        return all_tokens

    def convert_text_to_tensor(
        self: 'T',
        vocab: Dict[str, int],
        max_length: Optional[int] = None,
        dtype: str = 'int64',
    ) -> 'T':
        """Convert :attr:`.text` to :attr:`.tensor` inplace.

        In the end :attr:`.tensor` will be a 1D array. Its length is `max_length`
        when `max_length` is given, otherwise it has one entry per word in
        :attr:`.text`.

        :param vocab: a dictionary that maps a word to an integer index, `0` is reserved
            for padding, `1` is reserved for unknown words in :attr:`.text`. So you
            should *not* include these two entries in `vocab`.
        :param max_length: the maximum length of the sequence. Sequence longer than this
            are cut off from *beginning* (the last `max_length` words are kept).
            Sequence shorter than this will be padded with `0` on the *left*
            (zeros are prepended).
        :param dtype: the dtype of the generated :attr:`.tensor`
        :return: Document itself after processed
        """
        # NOTE(review): the previous docstring pointed to
        # `jina.types.document.converters.build_vocab` for building a vocab, but the
        # sentence was truncated -- confirm which helper is current before re-adding.
        self.tensor = np.array(
            _text_to_int_sequence(self.text, vocab, max_length), dtype=dtype
        )
        return self

    def convert_tensor_to_text(
        self: 'T', vocab: Union[Dict[str, int], Dict[int, str]], delimiter: str = ' '
    ) -> 'T':
        """Convert :attr:`.tensor` to :attr:`.text` inplace.

        :param vocab: either a dictionary that maps a word to an integer index, or its
            inverse that maps an integer index to a word. `0` is reserved for padding,
            `1` is reserved for unknown words in :attr:`.text`
        :param delimiter: the delimiter that used to connect all words into :attr:`.text`
        :return: Document itself after processed
        """
        # The lookup below needs index -> word. Only a word -> index vocab has to be
        # inverted; an index -> word vocab (or an empty one) is usable as is. The
        # previous code left `_vocab` unbound in those cases and raised.
        if vocab and isinstance(next(iter(vocab)), str):
            _vocab = {v: k for k, v in vocab.items()}
        else:
            _vocab = vocab

        _text = []
        for k in self.tensor:
            k = int(k)
            if k == 0:
                # padding carries no word
                continue
            elif k == 1:
                _text.append('<UNK>')
            else:
                # indices missing from the vocab are rendered as unknown too
                _text.append(_vocab.get(k, '<UNK>'))
        self.text = delimiter.join(_text)
        return self

    def convert_text_to_datauri(
        self: 'T', charset: str = 'utf-8', base64: bool = False
    ) -> 'T':
        """Convert :attr:`.text` to data :attr:`.uri`.

        :param charset: charset may be any character set registered with IANA
        :param base64: used to encode arbitrary octet sequences into a form that
            satisfies the rules of 7bit. Designed to be efficient for non-text 8 bit
            and binary data. Sometimes used for text data that frequently uses
            non-US-ASCII characters.
        :return: itself after processed
        """
        self.uri = _to_datauri(self.mime_type, self.text, charset, base64, binary=False)
        return self
def _text_to_word_sequence( text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ' ): translate_dict = {c: split for c in filters} translate_map = str.maketrans(translate_dict) text = text.lower().translate(translate_map) seq = text.split(split) for i in seq: if i: yield i def _text_to_int_sequence(text, vocab, max_len=None): seq = _text_to_word_sequence(text) vec = [vocab.get(s, 1) for s in seq] if max_len: if len(vec) < max_len: vec = [0] * (max_len - len(vec)) + vec elif len(vec) > max_len: vec = vec[-max_len:] return vec