from typing import Dict, Iterable, Sequence

from docarray import Document
from docarray.array.storage.base.getsetdel import BaseGetSetDelMixin
from docarray.array.storage.base.helper import Offset2ID


class GetSetDelMixin(BaseGetSetDelMixin):
    # Upper bound on the number of documents fetched per ``mget`` batch.
    MAX_OPENSEARCH_RETURNED_DOCS = 10000
    def _getitem(self, doc_id: str) -> 'Document':
        """Helper method for getting an item with OpenSearch as storage.

        :param doc_id: id of the document
        :raises KeyError: raised when the id does not exist in storage
        :return: the retrieved Document
        """
        try:
            result = self._client.get(index=self._config.index_name, id=doc_id)
            return Document.from_base64(result['_source']['blob'])
        except Exception as ex:
            raise KeyError(doc_id) from ex
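
    # Sketch of the retrieval round trip (illustrative only; the index and
    # id values below are made up). Each document lives in OpenSearch as a
    # base64-serialized blob under ``_source['blob']``:
    #
    #   result = client.get(index='docarray-index', id='my-doc-id')
    #   doc = Document.from_base64(result['_source']['blob'])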

    def _get_doc_by_id(self, _id: str) -> 'Document':
        """Concrete implementation of base class' ``_get_doc_by_id``.

        :param _id: the id of the document
        :return: the retrieved document from OpenSearch
        """
        return self._getitem(_id)

    def _get_docs_by_ids(self, ids: Sequence[str]) -> Iterable[Document]:
        """Concrete implementation of base class' ``_get_docs_by_ids``.

        :param ids: ids of the documents
        :raises KeyError: raised when any of the ids does not exist in storage
        :return: the retrieved documents
        """
        accumulated_docs = []
        accumulated_docs_id_not_found = []
        if not ids:
            return accumulated_docs

        # Fetch in batches in case the number of ids exceeds
        # MAX_OPENSEARCH_RETURNED_DOCS, the per-call ``mget`` cap.
        for pos in range(0, len(ids), self.MAX_OPENSEARCH_RETURNED_DOCS):
            retrieved_docs = self._client.mget(
                body={'ids': ids[pos : pos + self.MAX_OPENSEARCH_RETURNED_DOCS]},
                index=self._config.index_name,
            )['docs']
            for doc in retrieved_docs:
                if doc['found']:
                    accumulated_docs.append(
                        Document.from_base64(doc['_source']['blob'])
                    )
                else:
                    accumulated_docs_id_not_found.append(doc['_id'])

        if accumulated_docs_id_not_found:
            raise KeyError(accumulated_docs_id_not_found, accumulated_docs)
        return accumulated_docs
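
    # Illustration of the batching above (numbers are made up): with
    # MAX_OPENSEARCH_RETURNED_DOCS = 10000 and 25000 ids, ``mget`` is
    # called three times with the slices
    #
    #   ids[0:10000], ids[10000:20000], ids[20000:25000]
    #
    # and the partial results are accumulated into a single list.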

    def _del_doc_by_id(self, _id: str):
        """Concrete implementation of base class' ``_del_doc_by_id``.

        :param _id: the id of the document to delete
        """
        if self._doc_id_exists(_id):
            self._client.delete(index=self._config.index_name, id=_id)
        self._refresh(self._config.index_name)

    def _set_doc_by_id(self, _id: str, value: Document):
        """Concrete implementation of base class' ``_set_doc_by_id``.

        :param _id: the id of the document to update
        :param value: the document to store under that id
        """
        if _id != value.id:
            # The document is being re-keyed: drop the entry stored under
            # the old id before indexing under the document's own id.
            self._del_doc_by_id(_id)
        request = [self._document_to_opensearch_request(value)]
        self._send_requests(request)
        self._refresh(self._config.index_name)
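
    # Hypothetical re-keying example: assigning a document whose ``id``
    # differs from the storage key removes the stale entry first, so only
    # one copy survives:
    #
    #   da['old-id'] = Document(id='new-id')  # deletes 'old-id', indexes 'new-id'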

    def _set_docs_by_ids(
        self, ids: Sequence[str], docs: Iterable[Document], mismatch_ids: Dict
    ):
        """Overridden implementation of ``_set_docs_by_ids`` in order to add
        docs in batches and flush (refresh) once at the end.

        :param ids: the ids used for indexing
        :param docs: the documents to store
        :param mismatch_ids: ids that differ from their document's ``id``
            (unused by this implementation)
        """
        requests = []
        for _id, doc in zip(ids, docs):
            if _id != doc.id:
                self._del_doc_by_id(_id)
            requests.append(self._document_to_opensearch_request(doc))
        if requests:
            self._send_requests(requests)
        self._refresh(self._config.index_name)
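
    # Design note: building one bulk request and refreshing once amortizes
    # the per-call overhead that per-document indexing (each call followed
    # by a refresh, as in ``_set_doc_by_id``) would incur for large batches.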

    def _load_offset2ids(self):
        """Load the offset-to-id mapping from storage; start with an empty
        mapping when the array is not list-like."""
        if self._list_like:
            ids = self._get_offset2ids_meta()
            self._offset2ids = Offset2ID(ids, list_like=self._list_like)
        else:
            self._offset2ids = Offset2ID([], list_like=self._list_like)

    def _save_offset2ids(self):
        """Persist the offset-to-id mapping when the array is list-like."""
        if self._list_like:
            self._update_offset2ids_meta()
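
    # ``Offset2ID`` keeps the positional order of documents (offset -> id),
    # which is what makes list-style access such as ``da[0]`` or slicing
    # work on top of a key-value store; non-list-like arrays skip this
    # bookkeeping entirely.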

    def _document_to_opensearch_request(self, doc: Document) -> Dict:
        """Convert a Document into a bulk ``index`` action for OpenSearch."""
        extra_columns = {
            col: doc.tags.get(col) for col, _ in self._config.columns.items()
        }
        request = {
            '_op_type': 'index',
            '_id': doc.id,
            '_index': self._config.index_name,
            'embedding': self._map_embedding(doc.embedding),
            'blob': doc.to_base64(),
            **extra_columns,
        }
        if self._config.tag_indices:
            for index in self._config.tag_indices:
                request[index] = doc.tags.get(index)
        if doc.text:
            request['text'] = doc.text
        return request
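
# Example of the action dict produced by ``_document_to_opensearch_request``
# (all values below are made up; ``embedding`` holds whatever
# ``_map_embedding`` returns for the document's embedding):
#
#   {
#       '_op_type': 'index',
#       '_id': 'f0b2...',
#       '_index': 'my-index',
#       'embedding': [0.1, 0.2, 0.3],
#       'blob': 'gASV...',
#       'price': 9.99,          # extra column from ``config.columns``
#       'text': 'hello world',  # only present when ``doc.text`` is set
#   }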