# Source code for docarray.array.mixins.match

from typing import Optional, Union, Callable, Tuple, TYPE_CHECKING, Dict

if TYPE_CHECKING:  # pragma: no cover
    import numpy as np
    from docarray.typing import ArrayType
    from docarray import DocumentArray


class MatchMixin:
    """A mixin that provides match functionality to DocumentArrays"""

    def match(
        self,
        darray: 'DocumentArray',
        metric: Union[
            str, Callable[['ArrayType', 'ArrayType'], 'np.ndarray']
        ] = 'cosine',
        limit: Optional[Union[int, float]] = 20,
        normalization: Optional[Tuple[float, float]] = None,
        metric_name: Optional[str] = None,
        batch_size: Optional[int] = None,
        exclude_self: bool = False,
        filter: Optional[Dict] = None,
        only_id: bool = False,
        use_scipy: bool = False,
        device: str = 'cpu',
        num_worker: Optional[int] = 1,
        on: Optional[str] = None,
        **kwargs,
    ) -> None:
        """Compute embedding based nearest neighbour in ``darray`` for each Document in ``self``,
        and store results in ``matches``.

        The search itself is delegated to ``darray.find(self, ...)``; every argument
        below is forwarded to it unchanged. If either ``self`` or ``darray`` is empty
        the method returns immediately and leaves existing ``matches`` untouched.
        Otherwise the ``matches`` of every Document in ``self`` are cleared before
        the search and then replaced by the search result.

        For the purpose of evaluation, one can also directly use the
        :meth:`~docarray.array.mixins.evaluation.EvaluationMixin.embed_and_evaluate`
        function.

        .. note::
            'cosine', 'euclidean', 'sqeuclidean' are supported natively without extra dependency.

            You can use other distance metric provided by ``scipy``, such as `braycurtis`, `canberra`, `chebyshev`,
            `cityblock`, `correlation`, `cosine`, `dice`, `euclidean`, `hamming`, `jaccard`, `jensenshannon`,
            `kulsinski`, `mahalanobis`, `matching`, `minkowski`, `rogerstanimoto`, `russellrao`, `seuclidean`,
            `sokalmichener`, `sokalsneath`, `sqeuclidean`, `wminkowski`, `yule`.

            To use scipy metric, please set ``use_scipy=True``.

        - To make all matches values in [0, 1], use ``dA.match(dB, normalization=(0, 1))``
        - To invert the distance as score and make all values in range [0, 1],
          use ``dA.match(dB, normalization=(1, 0))``. Note, how ``normalization`` differs from the previous.
        - If a custom metric distance is provided, make sure that it returns scores as distances and
          not similarity, meaning the smaller the better.

        :param darray: the other DocumentArray to match against
        :param metric: the distance metric
        :param limit: the maximum number of matches, when not given defaults to 20.
        :param normalization: a tuple [a, b] to be used with min-max normalization,
                                the min distance will be rescaled to `a`, the max distance will be rescaled to `b`
                                all values will be rescaled into range `[a, b]`.
        :param metric_name: if provided, then match result will be marked with this string.
        :param batch_size: if provided, then ``darray`` is loaded in batches, where each of them is at most ``batch_size``
            elements. When `darray` is big, this can significantly speedup the computation.
        :param exclude_self: if set, Documents in ``darray`` with same ``id`` as the left-hand values will not be
                        considered as matches.
        :param filter: filter query used for pre-filtering
        :param only_id: if set, then returning matches will only contain ``id``
        :param use_scipy: if set, use ``scipy`` as the computation backend. Note, ``scipy`` does not support distance
            on sparse matrix.
        :param device: the computational device for ``.match()``, can be either `cpu` or `cuda`.
        :param num_worker: the number of parallel workers, defaults to 1. Passing ``None`` presumably lets the
            backend use the number of CPUs in the system (to be confirmed against ``find``).

                .. note::
                    This argument is only effective when ``batch_size`` is set.
        :param on: specifies a subindex to search on. If set, the returned DocumentArray will be retrieved from the given subindex.
        :param kwargs: other kwargs, forwarded to ``darray.find``.
        """

        # Nothing to do when there are no queries or nothing to search in.
        if not (self and darray):
            return

        # Drop results of any previous match so old and new matches never mix.
        for d in self:
            d.matches.clear()

        match_docs = darray.find(
            self,
            metric=metric,
            limit=limit,
            normalization=normalization,
            metric_name=metric_name,
            batch_size=batch_size,
            exclude_self=exclude_self,
            filter=filter,
            only_id=only_id,
            use_scipy=use_scipy,
            device=device,
            num_worker=num_worker,
            on=on,
            **kwargs,
        )

        # ``find`` may hand back one result object instead of a list of
        # per-query results; wrap it so the pairing below works either way.
        if not isinstance(match_docs, list):
            match_docs = [match_docs]

        # Results are paired with the query Documents by position.
        for m, d in zip(match_docs, self):
            d.matches = m