from typing import overload, Dict, Optional, List, TYPE_CHECKING, Sequence, Any
from docarray.document.data import DocumentData
from docarray.document.mixins import AllMixins
from docarray.base import BaseDCType
from docarray.math.ndarray import detach_tensor_if_present
if TYPE_CHECKING: # pragma: no cover
from docarray.typing import ArrayType, StructValueType, DocumentContentType
[docs]class Document(AllMixins, BaseDCType):
"""Document is the basic data type in DocArray.
A Document is a container for any kind of data, be it text, image, audio, video, or 3D meshes.
You can initialize a Document object with given attributes:
.. code-block:: python
from docarray import Document
import numpy
d1 = Document(text='hello')
d3 = Document(tensor=numpy.array([1, 2, 3]))
d4 = Document(
uri='https://jina.ai',
mime_type='text/plain',
granularity=1,
adjacency=3,
tags={'foo': 'bar'},
)
Documents support a :ref:`nested structure <recursive-nested-document>`, which can also be specified during construction:
.. code-block:: python
d = Document(
id='d0',
chunks=[Document(id='d1', chunks=Document(id='d2'))],
matches=[Document(id='d3')],
)
A Document can embed its contents using the :meth:`embed` method and a provided embedding model:
.. code-block:: python
import torchvision
q = (
Document(uri='/Users/usr/path/to/image.jpg')
.load_uri_to_image_tensor()
.set_image_tensor_normalization()
.set_image_tensor_channel_axis(-1, 0)
)
model = torchvision.models.resnet50(pretrained=True)
q.embed(model)
Multiple Documents can be organized into a :class:`~docarray.array.document.DocumentArray`.
.. seealso::
For further details, see our :ref:`user guide <document>`.
"""
_data_class = DocumentData
_unresolved_fields_dest = 'tags'
_post_init_fields = (
'text',
'blob',
'tensor',
'content',
'uri',
'mime_type',
'chunks',
'matches',
)
@overload
def __init__(self):
"""Create an empty Document."""
...
@overload
def __init__(self, _obj: Optional['Document'] = None, copy: bool = False):
...
@overload
def __init__(self, _obj: Optional[Any] = None):
"""Create a Document from a `docarray.dataclass` instance"""
...
@overload
def __init__(
self,
_obj: Optional[Dict],
copy: bool = False,
field_resolver: Optional[Dict[str, str]] = None,
unknown_fields_handler: str = 'catch',
):
...
@overload
def __init__(self, blob: Optional[bytes] = None, **kwargs):
"""Create a Document with binary content."""
...
@overload
def __init__(self, tensor: Optional['ArrayType'] = None, **kwargs):
"""Create a Document with NdArray-like content."""
...
@overload
def __init__(self, text: Optional[str] = None, **kwargs):
"""Create a Document with string content."""
...
@overload
def __init__(self, uri: Optional[str] = None, **kwargs):
"""Create a Document with content from a URI."""
...
@overload
def __init__(
self,
parent_id: Optional[str] = None,
granularity: Optional[int] = None,
adjacency: Optional[int] = None,
blob: Optional[bytes] = None,
tensor: Optional['ArrayType'] = None,
mime_type: Optional[str] = None,
text: Optional[str] = None,
content: Optional['DocumentContentType'] = None,
weight: Optional[float] = None,
uri: Optional[str] = None,
tags: Optional[Dict[str, 'StructValueType']] = None,
offset: Optional[float] = None,
location: Optional[List[float]] = None,
embedding: Optional['ArrayType'] = None,
modality: Optional[str] = None,
evaluations: Optional[Dict[str, Dict[str, 'StructValueType']]] = None,
scores: Optional[Dict[str, Dict[str, 'StructValueType']]] = None,
chunks: Optional[Sequence['Document']] = None,
matches: Optional[Sequence['Document']] = None,
):
...
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __getstate__(self):
state = self.__dict__.copy()
for attribute in ['embedding', 'tensor']:
if hasattr(self, attribute):
setattr(
state['_data'],
attribute,
detach_tensor_if_present(getattr(state['_data'], attribute)),
)
return state