from collections import Counter
from typing import Tuple, Dict, Union, Optional, TYPE_CHECKING
import numpy as np
from docarray.document.mixins.helper import _uri_to_blob, _to_datauri
if TYPE_CHECKING:  # pragma: no cover
    from docarray.typing import T


class TextDataMixin:
    """Provide helper functions for :class:`Document` to support text data."""

    def load_uri_to_text(self: 'T', charset: str = 'utf-8', **kwargs) -> 'T':
        """Convert :attr:`.uri` to :attr:`.text` inplace.

        :param charset: charset may be any character set registered with IANA
        :param kwargs: keyword arguments to pass to :meth:`_uri_to_blob`, such as `timeout`
        :return: itself after processing
        """
        blob = _uri_to_blob(self.uri, **kwargs)
        self.text = blob.decode(charset)
        return self
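
    # Usage sketch (not from the original source; the URI is illustrative and
    # assumes it resolves to a plain-text resource):
    #
    #   d = Document(uri='https://example.com/hello.txt')
    #   d.load_uri_to_text()            # fetch the resource and decode as UTF-8
    #   assert isinstance(d.text, str)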

    def get_vocabulary(self, text_attrs: Tuple[str, ...] = ('text',)) -> Dict[str, int]:
        """Get the text vocabulary as a counter dict that maps each word to its frequency across all
        attributes in `text_attrs`.

        :param text_attrs: the textual attributes from which the vocabulary is derived
        :return: a vocabulary dictionary where the key is the word and the value is the frequency of that
            word over all text fields
        """
        all_tokens = Counter()
        for f in text_attrs:
            all_tokens.update(_text_to_word_sequence(getattr(self, f)))
        return all_tokens
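
    # Usage sketch (illustrative values, not from the original source):
    #
    #   d = Document(text='hello world hello')
    #   d.get_vocabulary()              # Counter({'hello': 2, 'world': 1})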

    def convert_text_to_tensor(
        self: 'T',
        vocab: Dict[str, int],
        max_length: Optional[int] = None,
        dtype: str = 'int64',
    ) -> 'T':
        """Convert :attr:`.text` to :attr:`.tensor` inplace.

        In the end :attr:`.tensor` will be a 1D array of length `max_length`.

        To get the vocab of a DocumentArray, you can use `jina.types.document.converters.build_vocab`.

        :param vocab: a dictionary that maps a word to an integer index; `0` is reserved for padding and `1` is
            reserved for unknown words in :attr:`.text`, so you should *not* include these two entries in `vocab`.
        :param max_length: the maximum length of the sequence. Sequences longer than this are cut off from the
            *beginning*; sequences shorter than this are padded with `0` on the left-hand side.
        :param dtype: the dtype of the generated :attr:`.tensor`
        :return: the Document itself after processing
        """
        self.tensor = np.array(
            _text_to_int_sequence(self.text, vocab, max_length), dtype=dtype
        )
        return self
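
    # Usage sketch (illustrative vocab, not from the original source): index `0`
    # pads, index `1` marks unknown words, so real words start at `2`.
    #
    #   vocab = {'hello': 2, 'world': 3}
    #   d = Document(text='hello foo world')
    #   d.convert_text_to_tensor(vocab, max_length=5)
    #   # d.tensor -> array([0, 0, 2, 1, 3])   ('foo' -> 1, left-padded to 5)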

    def convert_tensor_to_text(
        self: 'T', vocab: Union[Dict[str, int], Dict[int, str]], delimiter: str = ' '
    ) -> 'T':
        """Convert :attr:`.tensor` to :attr:`.text` inplace.

        :param vocab: a dictionary that maps a word to an integer index, or the inverse mapping from an
            integer index to a word; `0` is reserved for padding and `1` is reserved for unknown words
        :param delimiter: the delimiter used to join all words into :attr:`.text`
        :return: the Document itself after processing
        """
        if isinstance(list(vocab.keys())[0], str):
            # `vocab` maps word -> index; invert it to index -> word
            _vocab = {v: k for k, v in vocab.items()}
        else:
            _vocab = vocab

        _text = []
        for k in self.tensor:
            k = int(k)
            if k == 0:
                # `0` is the padding index; skip it
                continue
            elif k == 1:
                # `1` is the reserved index for unknown words
                _text.append('<UNK>')
            else:
                _text.append(_vocab.get(k, '<UNK>'))

        self.text = delimiter.join(_text)
        return self
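
    # Roundtrip sketch (continuing the illustrative vocab above):
    #
    #   d.convert_tensor_to_text({'hello': 2, 'world': 3})
    #   # d.text -> 'hello <UNK> world'   (padding `0` dropped, `1` -> '<UNK>')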

    def convert_text_to_datauri(
        self: 'T', charset: str = 'utf-8', base64: bool = False
    ) -> 'T':
        """Convert :attr:`.text` to a data :attr:`.uri`.

        :param charset: charset may be any character set registered with IANA
        :param base64: if `True`, encode the content as base64, which maps arbitrary octet sequences into a
            form that satisfies the rules of 7bit encoding. It is designed to be efficient for non-text 8-bit
            and binary data, and is sometimes used for text data that frequently uses non-US-ASCII characters.
        :return: itself after processing
        """
        self.uri = _to_datauri(self.mime_type, self.text, charset, base64, binary=False)
        return self
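
    # Usage sketch (illustrative, not from the original source):
    #
    #   d = Document(text='hello', mime_type='text/plain')
    #   d.convert_text_to_datauri()
    #   # d.uri -> a 'data:text/plain;charset=utf-8,...' style URI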


def _text_to_word_sequence(
    text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' '
):
    # replace every filtered character with the split character, then lowercase
    translate_dict = {c: split for c in filters}
    translate_map = str.maketrans(translate_dict)
    text = text.lower().translate(translate_map)

    # yield only the non-empty tokens
    seq = text.split(split)
    for i in seq:
        if i:
            yield i
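
# Behavior sketch for the tokenizer (illustrative input):
#
#   list(_text_to_word_sequence('Hello, World!'))   # -> ['hello', 'world']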


def _text_to_int_sequence(text, vocab, max_len=None):
    seq = _text_to_word_sequence(text)
    # map each word to its index, falling back to the reserved `1` for unknown words
    vec = [vocab.get(s, 1) for s in seq]
    if max_len:
        if len(vec) < max_len:
            # left-pad with the reserved `0` index
            vec = [0] * (max_len - len(vec)) + vec
        elif len(vec) > max_len:
            # cut off from the beginning, keeping the last `max_len` tokens
            vec = vec[-max_len:]
    return vec
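
# Padding vs. truncation sketch (illustrative vocab):
#
#   _text_to_int_sequence('a b', {'a': 2, 'b': 3}, max_len=4)    # -> [0, 0, 2, 3]
#   _text_to_int_sequence('a b a', {'a': 2, 'b': 3}, max_len=2)  # -> [3, 2]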