Source code for docarray.document

from typing import overload, Dict, Optional, List, TYPE_CHECKING, Sequence, Any

from docarray.document.data import DocumentData
from docarray.document.mixins import AllMixins
from docarray.base import BaseDCType
from docarray.math.ndarray import detach_tensor_if_present

if TYPE_CHECKING:  # pragma: no cover
    from docarray.typing import ArrayType, StructValueType, DocumentContentType


class Document(AllMixins, BaseDCType):
    """Document is the basic data type in DocArray.
    A Document is a container for any kind of data, be it text, image, audio, video, or 3D meshes.

    You can initialize a Document object with given attributes:

    .. code-block:: python

        from docarray import Document
        import numpy

        d1 = Document(text='hello')
        d3 = Document(tensor=numpy.array([1, 2, 3]))
        d4 = Document(
            uri='https://jina.ai',
            mime_type='text/plain',
            granularity=1,
            adjacency=3,
            tags={'foo': 'bar'},
        )

    Documents support a :ref:`nested structure <recursive-nested-document>`, which can also be specified during construction:

    .. code-block:: python

        d = Document(
            id='d0',
            chunks=[Document(id='d1', chunks=Document(id='d2'))],
            matches=[Document(id='d3')],
        )

    A Document can embed its contents using the :meth:`embed` method and a provided embedding model:

    .. code-block:: python

        import torchvision

        q = (
            Document(uri='/Users/usr/path/to/image.jpg')
            .load_uri_to_image_tensor()
            .set_image_tensor_normalization()
            .set_image_tensor_channel_axis(-1, 0)
        )
        model = torchvision.models.resnet50(pretrained=True)
        q.embed(model)

    Multiple Documents can be organized into a :class:`~docarray.array.document.DocumentArray`.

    .. seealso::
        For further details, see our :ref:`user guide <document>`.
    """

    # The three attributes below are read by code outside this class
    # (presumably ``BaseDCType`` -- confirm against ``docarray.base``).

    # Class used for the underlying data holder stored on the instance.
    _data_class = DocumentData
    # NOTE(review): looks like the name of the field that collects
    # keyword arguments not matching a known field -- confirm.
    _unresolved_fields_dest = 'tags'
    # NOTE(review): presumably the fields that are assigned after the
    # data object has been built rather than passed to it -- confirm.
    _post_init_fields = (
        'text',
        'blob',
        'tensor',
        'content',
        'uri',
        'mime_type',
        'chunks',
        'matches',
    )

    @overload
    def __init__(self):
        """Create an empty Document."""
        ...

    @overload
    def __init__(self, _obj: Optional['Document'] = None, copy: bool = False):
        """Create a Document from another Document.

        :param _obj: the Document to build from
        :param copy: copy flag forwarded to the base class constructor
        """
        ...

    @overload
    def __init__(self, _obj: Optional[Any] = None):
        """Create a Document from a `docarray.dataclass` instance"""
        ...

    @overload
    def __init__(
        self,
        _obj: Optional[Dict],
        copy: bool = False,
        field_resolver: Optional[Dict[str, str]] = None,
        unknown_fields_handler: str = 'catch',
    ):
        """Create a Document from a dictionary.

        :param _obj: the dictionary to build from
        :param copy: copy flag forwarded to the base class constructor
        :param field_resolver: a string-to-string mapping forwarded to the
            base class constructor (presumably maps keys of ``_obj`` to
            Document field names -- confirm against the base class)
        :param unknown_fields_handler: handling mode forwarded to the base
            class constructor; defaults to ``'catch'``
        """
        ...

    @overload
    def __init__(self, blob: Optional[bytes] = None, **kwargs):
        """Create a Document with binary content."""
        ...

    @overload
    def __init__(self, tensor: Optional['ArrayType'] = None, **kwargs):
        """Create a Document with NdArray-like content."""
        ...

    @overload
    def __init__(self, text: Optional[str] = None, **kwargs):
        """Create a Document with string content."""
        ...

    @overload
    def __init__(self, uri: Optional[str] = None, **kwargs):
        """Create a Document with content from a URI."""
        ...

    @overload
    def __init__(
        self,
        parent_id: Optional[str] = None,
        granularity: Optional[int] = None,
        adjacency: Optional[int] = None,
        blob: Optional[bytes] = None,
        tensor: Optional['ArrayType'] = None,
        mime_type: Optional[str] = None,
        text: Optional[str] = None,
        content: Optional['DocumentContentType'] = None,
        weight: Optional[float] = None,
        uri: Optional[str] = None,
        tags: Optional[Dict[str, 'StructValueType']] = None,
        offset: Optional[float] = None,
        location: Optional[List[float]] = None,
        embedding: Optional['ArrayType'] = None,
        modality: Optional[str] = None,
        evaluations: Optional[Dict[str, Dict[str, 'StructValueType']]] = None,
        scores: Optional[Dict[str, Dict[str, 'StructValueType']]] = None,
        chunks: Optional[Sequence['Document']] = None,
        matches: Optional[Sequence['Document']] = None,
    ):
        """Create a Document from explicitly named attributes."""
        ...

    def __init__(self, *args, **kwargs):
        """Forward all arguments unchanged to the parent constructor.

        The ``@overload`` signatures above only describe the accepted call
        shapes for type checkers and documentation; the actual handling is
        done by the parent class.
        """
        super().__init__(*args, **kwargs)

    def __getstate__(self):
        """Return the instance state with ``embedding``/``tensor`` detached.

        The returned dictionary is a shallow copy of ``self.__dict__``. Its
        ``'_data'`` entry is replaced by a shallow copy of the data object in
        which ``embedding`` and ``tensor`` have been passed through
        :func:`detach_tensor_if_present`.

        The data object is copied before it is modified so that producing
        the state never alters the live Document: ``self.__dict__.copy()`` is
        shallow, hence ``state['_data']`` would otherwise be the very same
        object as ``self._data``.

        :return: the state dictionary
        """
        # Imported locally under an alias: ``copy`` is also a parameter name
        # in this class' constructor overloads.
        from copy import copy as _shallow_copy

        state = self.__dict__.copy()

        detached_data = None
        for attribute in ('embedding', 'tensor'):
            if hasattr(self, attribute):
                if detached_data is None:
                    # Copy lazily, on first write, so that the state is only
                    # touched when there is something to detach.
                    detached_data = _shallow_copy(state['_data'])
                    state['_data'] = detached_data
                setattr(
                    detached_data,
                    attribute,
                    detach_tensor_if_present(getattr(detached_data, attribute)),
                )

        return state