from typing import (
Type,
TYPE_CHECKING,
Optional,
overload,
Union,
List,
TextIO,
Dict,
Iterable,
)
if TYPE_CHECKING: # pragma: no cover
from docarray.typing import T
import numpy as np
import csv
class FromGeneratorMixin:
"""Provide helper functions filling a :class:`DocumentArray`-like object with a generator."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Let users know that the from_xyz class methods should not be called on instances
cls_method_names = [
attr
for attr, obj in vars(FromGeneratorMixin).items()
if isinstance(obj, classmethod)
]
for cls_meth in cls_method_names:
setattr(self, cls_meth, _raise_attribute_error)
@classmethod
def _from_generator(cls: Type['T'], meth: str, *args, **kwargs) -> 'T':
from docarray.document import generators
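        # Resolve the generator function by name; note that ``kwargs`` are
        # forwarded both to the container constructor and to the generator.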
from_fn = getattr(generators, meth)
da_like = cls(**kwargs)
da_like.extend(from_fn(*args, **kwargs))
return da_like
@classmethod
@overload
def from_ndarray(
cls: Type['T'],
array: 'np.ndarray',
axis: int = 0,
size: Optional[int] = None,
shuffle: bool = False,
*args,
**kwargs,
) -> 'T':
"""Build from a numpy array.
:param array: the numpy ndarray data source
:param axis: iterate over that axis
        :param size: the maximum number of sub-arrays
:param shuffle: shuffle the numpy data source beforehand
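
        A minimal usage sketch (the array shape and values are illustrative only):

        .. code-block:: python

            import numpy as np
            from docarray import DocumentArray

            # yields one Document per sub-array along ``axis``, here 10 Documents
            da = DocumentArray.from_ndarray(np.random.random([10, 3]))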
"""
...
    @classmethod
def from_ndarray(cls: Type['T'], *args, **kwargs) -> 'T':
"""
# noqa: DAR101
# noqa: DAR102
# noqa: DAR201
"""
return cls._from_generator('from_ndarray', *args, **kwargs)
@classmethod
@overload
def from_files(
cls: Type['T'],
patterns: Union[str, List[str]],
recursive: bool = True,
size: Optional[int] = None,
sampling_rate: Optional[float] = None,
read_mode: Optional[str] = None,
to_dataturi: bool = False,
exclude_regex: Optional[str] = None,
*args,
**kwargs,
) -> 'T':
"""Build from a list of file path or the content of the files.
:param patterns: The pattern may contain simple shell-style wildcards, e.g. '\*.py', '[\*.zip, \*.gz]'
:param recursive: If recursive is true, the pattern '**' will match any files
and zero or more directories and subdirectories
:param size: the maximum number of the files
:param sampling_rate: the sampling rate between [0, 1]
        :param read_mode: specifies the mode in which the file is opened.
            'r' for reading in text mode, 'rb' for reading in binary mode.
            If ``read_mode`` is None, it will iterate over file names instead of file contents.
        :param to_dataturi: if set, the Document.uri will be filled with a data URI instead of the plain URI
        :param exclude_regex: if set, file names that match this regex pattern are not included.
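
        A minimal usage sketch (the glob pattern and paths are illustrative only):

        .. code-block:: python

            from docarray import DocumentArray

            # yields one Document per matching file path; pass ``read_mode``
            # to load the file contents instead
            da = DocumentArray.from_files('imgs/**/*.png', recursive=True, size=100)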
"""
...
    @classmethod
def from_files(cls: Type['T'], *args, **kwargs) -> 'T':
"""
# noqa: DAR101
# noqa: DAR102
# noqa: DAR201
"""
return cls._from_generator('from_files', *args, **kwargs)
@classmethod
@overload
def from_csv(
cls: Type['T'],
file: Union[str, TextIO],
field_resolver: Optional[Dict[str, str]] = None,
size: Optional[int] = None,
sampling_rate: Optional[float] = None,
dialect: Union[str, 'csv.Dialect'] = 'excel',
) -> 'T':
"""Build from CSV.
:param file: file paths or file handler
        :param field_resolver: a map from field names defined in the JSON or dict to the field
            names defined in Document.
:param size: the maximum number of the documents
:param sampling_rate: the sampling rate between [0, 1]
        :param dialect: define a set of parameters specific to a particular CSV dialect. It can be a string
            naming a dialect predefined on your system, or a :class:`csv.Dialect` instance that groups specific
            formatting parameters together. If you don't know the dialect and the default one does not work
            for you, try setting it to ``auto``.
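
        A minimal usage sketch (the file name and column name are illustrative only):

        .. code-block:: python

            from docarray import DocumentArray

            # map the CSV column 'sentence' onto the ``text`` field of each Document
            da = DocumentArray.from_csv('toy.csv', field_resolver={'sentence': 'text'})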
"""
...
    @classmethod
def from_csv(cls: Type['T'], *args, **kwargs) -> 'T':
"""
# noqa: DAR101
# noqa: DAR102
# noqa: DAR201
"""
return cls._from_generator('from_csv', *args, **kwargs)
@classmethod
@overload
def from_huggingface_datasets(
cls: Type['T'],
dataset_path: str,
field_resolver: Optional[Dict[str, str]] = None,
size: Optional[int] = None,
sampling_rate: Optional[float] = None,
filter_fields: bool = False,
**datasets_kwargs,
) -> 'T':
"""Build from Hugging Face Datasets. Yields documents.
        This function helps to load datasets from the Hugging Face Datasets Hub
        (https://huggingface.co/datasets) in Jina. Additional parameters can be
        passed to the ``datasets`` library via keyword arguments; the ``load_dataset``
        function from the ``datasets`` library is used to load the datasets.
:param dataset_path: a valid dataset path for Hugging Face Datasets library.
:param field_resolver: a map from field names defined in ``document`` (JSON, dict) to the field
names defined in Protobuf. This is only used when the given ``document`` is
a JSON string or a Python dict.
:param size: the maximum number of the documents
:param sampling_rate: the sampling rate between [0, 1]
        :param filter_fields: specifies whether to filter the dataset with the fields
            given in the ``field_resolver`` argument.
:param **datasets_kwargs: additional arguments for ``load_dataset`` method
from Datasets library. More details at
https://huggingface.co/docs/datasets/package_reference/loading_methods.html#datasets.load_dataset
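
        A minimal usage sketch (the dataset path and keyword arguments are illustrative only):

        .. code-block:: python

            from docarray import DocumentArray

            # ``name`` and ``split`` are forwarded to ``datasets.load_dataset``
            da = DocumentArray.from_huggingface_datasets('glue', name='cola', split='train')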
"""
...
    @classmethod
def from_huggingface_datasets(cls: Type['T'], *args, **kwargs) -> 'T':
"""
# noqa: DAR101
# noqa: DAR102
# noqa: DAR201
"""
return cls._from_generator('from_huggingface_datasets', *args, **kwargs)
@classmethod
@overload
def from_ndjson(
cls: Type['T'],
fp: Iterable[str],
field_resolver: Optional[Dict[str, str]] = None,
size: Optional[int] = None,
sampling_rate: Optional[float] = None,
) -> 'T':
"""Build from line separated JSON. Yields documents.
:param fp: file paths
:param field_resolver: a map from field names defined in ``document`` (JSON, dict) to the field
names defined in Protobuf. This is only used when the given ``document`` is
a JSON string or a Python dict.
:param size: the maximum number of the documents
:param sampling_rate: the sampling rate between [0, 1]
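
        A minimal usage sketch (the file name is illustrative only):

        .. code-block:: python

            from docarray import DocumentArray

            # an open text file is an iterable of lines, one JSON document per line
            with open('docs.ndjson') as fp:
                da = DocumentArray.from_ndjson(fp)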
"""
...
    @classmethod
def from_ndjson(cls: Type['T'], *args, **kwargs) -> 'T':
"""
# noqa: DAR101
# noqa: DAR102
# noqa: DAR201
"""
return cls._from_generator('from_ndjson', *args, **kwargs)
@classmethod
@overload
def from_lines(
        cls: Type['T'],
lines: Optional[Iterable[str]] = None,
filepath: Optional[str] = None,
read_mode: str = 'r',
line_format: str = 'json',
field_resolver: Optional[Dict[str, str]] = None,
size: Optional[int] = None,
sampling_rate: Optional[float] = None,
) -> 'T':
"""Build from lines, json and csv. Yields documents or strings.
:param lines: a list of strings, each is considered as a document
:param filepath: a text file that each line contains a document
:param read_mode: specifies the mode in which the file
is opened. 'r' for reading in text mode, 'rb' for reading in binary
:param line_format: the format of each line ``json`` or ``csv``
:param field_resolver: a map from field names defined in ``document`` (JSON, dict) to the field
names defined in Protobuf. This is only used when the given ``document`` is
a JSON string or a Python dict.
:param size: the maximum number of the documents
:param sampling_rate: the sampling rate between [0, 1]
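
        A minimal usage sketch (the JSON strings are illustrative only):

        .. code-block:: python

            from docarray import DocumentArray

            # each JSON line becomes one Document with its ``text`` field set
            da = DocumentArray.from_lines(
                ['{"text": "hello"}', '{"text": "world"}'], line_format='json'
            )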
"""
...
    @classmethod
def from_lines(cls: Type['T'], *args, **kwargs) -> 'T':
"""
# noqa: DAR101
# noqa: DAR102
# noqa: DAR201
"""
return cls._from_generator('from_lines', *args, **kwargs)
def _raise_attribute_error(*args, **kwargs) -> None:
    raise AttributeError(
        'This class method can not be called on a DocumentArray instance, '
        'only on the DocumentArray class itself.'
    )
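

# A minimal sketch of the instance-call guard installed in ``__init__`` above,
# assuming ``DocumentArray`` mixes in :class:`FromGeneratorMixin` and
# ``my_array`` is any numpy array:
#
#   from docarray import DocumentArray
#
#   DocumentArray.from_ndarray(my_array)    # OK: called on the class
#   DocumentArray().from_ndarray(my_array)  # raises AttributeError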