Source code for docarray.math.ndarray

from typing import TYPE_CHECKING, Tuple, Sequence, Optional, List, Any

import numpy as np

if TYPE_CHECKING:  # pragma: no cover
    from docarray.typing import ArrayType
    from docarray import Document, DocumentArray


def unravel(docs: Sequence['Document'], field: str) -> Optional['ArrayType']:
    """Gather ``field`` from every document and stack the values into one array.

    The value found on the first document decides which framework is used
    for stacking.

    :param docs: the documents to read from
    :param field: name of the attribute to collect from each document
    :return: the stacked values. When the first document's value is ``None``,
        the plain list of per-document values is returned if at least one of
        them is set, otherwise ``None``. ``None`` is also returned when the
        detected framework is not one of those handled below.
    :raises ValueError: if the first document carries a value but another
        document's value is ``None``
    """
    head = getattr(docs[0], field)
    if head is None:
        # Nothing to infer a framework from: hand back the raw values, or
        # ``None`` when every document is unset.
        values = [getattr(doc, field) for doc in docs]
        has_any_value = any(item is not None for item in values)
        return values if has_any_value else None

    framework, _ = get_array_type(head)
    container = type(head)
    values = [getattr(doc, field) for doc in docs]

    missing = [pos for pos, item in enumerate(values) if item is None]
    if missing:
        raise ValueError(
            f'Document{missing}.{field} is None. Can not stack into `{field}s`.'
        )

    if framework == 'python':
        return container(values)
    if framework == 'numpy':
        return np.stack(values)
    if framework == 'tensorflow':
        import tensorflow as tf

        return tf.stack(values)
    if framework == 'torch':
        import torch

        return torch.stack(values)
    if framework == 'paddle':
        import paddle

        return paddle.stack(values)
    if framework == 'scipy':
        import scipy.sparse

        # Rebuild with the first element's class so the sparse format is kept.
        return container(scipy.sparse.vstack(values))
    return None
def ravel(value: 'ArrayType', docs: 'DocumentArray', field: str) -> None:
    """Distribute the rows of ``value`` onto ``doc.field`` of each document.

    Documents and rows are paired in order; pairing stops as soon as either
    side runs out.

    :param value: the value to be set on ``doc.field``; a list/tuple is split
        element by element, anything else is split along its first axis
    :param docs: the docs to set
    :param field: the field of the doc to set
    """
    sparse_format = value.getformat() if hasattr(value, 'getformat') else None

    if sparse_format in {'bsr', 'coo'}:
        # scipy only: BSR and COO do not implement ``[j, ...]`` indexing.
        # ``getrow()`` is available but implicitly yields a CSR row, so each
        # row is converted back to the original format. Not very efficient,
        # but it is the best available.
        back_to_format = f'to{sparse_format}'
        num_rows = value.shape[0]
        for doc, row_idx in zip(docs, range(num_rows)):
            csr_row = value.getrow(row_idx)
            docs[doc.id, field] = getattr(csr_row, back_to_format)()
        return

    if isinstance(value, (list, tuple)):
        for doc, item in zip(docs, value):
            docs[doc.id, field] = item
        return

    num_rows = value.shape[0]
    for doc, row_idx in zip(docs, range(num_rows)):
        docs[doc.id, field] = value[row_idx, ...]
def get_array_type(
    array: 'ArrayType', raise_error_if_not_array: bool = True
) -> Tuple[str, bool]:
    """Identify which framework an array belongs to without importing it.

    Detection relies only on the module path and class name of ``array``.

    :param array: any array, scipy, numpy, tf, torch, etc.
    :param raise_error_if_not_array: when true, an unrecognised object raises;
        when false, it is reported as ``('python', False)`` instead
    :return: a tuple where the first element represents the framework, the
        second represents if it is sparse array
    :raises TypeError: if the object is not recognised, is not ``None``, and
        ``raise_error_if_not_array`` is true
    :raises ValueError: if the object is ``None`` and
        ``raise_error_if_not_array`` is true
    """
    klass = array.__class__
    module_parts = klass.__module__.split('.')
    kind = klass.__name__

    if isinstance(array, (list, tuple)):
        return 'python', False

    if 'numpy' in module_parts:
        return 'numpy', False

    # For the two docarray containers, sparse or not is irrelevant.
    if 'docarray' in module_parts and kind == 'NdArray':
        return 'docarray', False
    if 'docarray_pb2' in module_parts and kind == 'NdArrayProto':
        return 'docarray_proto', False

    if 'tensorflow' in module_parts:
        if kind == 'SparseTensor':
            return 'tensorflow', True
        if kind in ('Tensor', 'EagerTensor'):
            return 'tensorflow', False

    if 'torch' in module_parts and kind == 'Tensor':
        return 'torch', array.is_sparse

    if 'paddle' in module_parts and kind == 'Tensor':
        # Paddle does not support sparse tensor on 11/8/2021
        # https://github.com/PaddlePaddle/Paddle/issues/36697
        return 'paddle', False

    if 'scipy' in module_parts and 'sparse' in module_parts:
        return 'scipy', True

    if not raise_error_if_not_array:
        return 'python', False
    if array is None:
        raise ValueError(
            'Empty ndarray. Did you forget to set .embedding/.tensor value and now you are operating on it?'
        )
    raise TypeError(f'can not determine the array type: {module_parts}.{kind}')
def to_numpy_array(value) -> 'np.ndarray':
    """Convert ``value`` to its numpy form regardless of the framework type.

    Sparse inputs are densified first, then ``.numpy()`` is applied when the
    object offers it, and plain Python sequences go through ``np.array``.

    :param value: the array-like object to convert
    :return: the value in :class:`numpy.ndarray`.
    """
    framework, sparse = get_array_type(value)
    result = value

    if sparse:
        # Use the first densifying method the object exposes; only when it
        # has neither do we fall back to the tensorflow-specific helper.
        for method_name in ('todense', 'to_dense'):
            if hasattr(result, method_name):
                result = getattr(result, method_name)()
                break
        else:
            if framework == 'tensorflow':
                import tensorflow as tf

                if isinstance(result, tf.SparseTensor):
                    result = tf.sparse.to_dense(result)

    if hasattr(result, 'numpy'):
        result = result.numpy()

    return np.array(result) if framework == 'python' else result
def to_list(value) -> List[float]:
    """Convert an array-like ``value`` into a (possibly nested) Python list.

    :param value: the array-like object to convert
    :return: the content of ``value`` as a list
    :raises TypeError: if the numpy conversion yields neither a
        :class:`numpy.ndarray` nor a list
    """
    converted = to_numpy_array(value)
    if isinstance(converted, np.ndarray):
        return converted.tolist()
    if isinstance(converted, list):
        return converted
    raise TypeError(f'{converted} can not be converted into list')
def get_array_rows(array: 'ArrayType') -> Tuple[int, int]:
    """Get the number of rows of the ndarray without importing all frameworks

    A Python list/tuple counts as two-dimensional when its first element is
    itself a list or tuple; otherwise it is a single one-dimensional row.
    An empty list/tuple is treated as a single one-dimensional row as well.

    :param array: input array
    :return: (num_rows, ndim)
    :raises ValueError: if the detected array type has no known way of
        reporting its rows

    Examples

    >>> get_array_rows([1,2,3])
    (1, 1)
    >>> get_array_rows([[1,2,3], [4,5,6]])
    (2, 2)
    >>> get_array_rows([[1,2,3], [4,5,6], [7,8,9]])
    (3, 2)
    >>> get_array_rows(np.array([[1,2,3], [4,5,6], [7,8,9]]))
    (3, 2)
    """
    array_type, _ = get_array_type(array)

    if array_type == 'python':
        # Guard the peek at ``array[0]``: an empty sequence has no first
        # element, and is reported like any other flat one-dimensional input.
        is_nested = len(array) > 0 and isinstance(array[0], (list, tuple))
        if is_nested:
            return len(array), 2
        return 1, 1

    if array_type in ('numpy', 'tensorflow', 'torch', 'paddle', 'scipy'):
        ndim = array.ndim
        # A one-dimensional array is a single row, not ``shape[0]`` rows.
        num_rows = 1 if ndim == 1 else array.shape[0]
        return num_rows, ndim

    raise ValueError(
        f'can not determine the number of rows for array type `{array_type}`'
    )
def check_arraylike_equality(x: 'ArrayType', y: 'ArrayType'):
    """Check if two array type objects are the same with the supported frameworks.

    Two objects from different frameworks, or one sparse and one dense, are
    never considered equal. Frameworks without a dedicated comparison below
    always compare as not equal.

    :param x: the first array-like object
    :param y: the second array-like object
    :return: the outcome of the framework-specific comparison, ``False`` when
        the framework or sparsity of the two inputs differ

    Examples

    >>> import numpy as np
    >>> x = np.array([[1,2,0,0,3],[1,2,0,0,3]])
    >>> check_arraylike_equality(x,x)
    True

    >>> from scipy import sparse as sp
    >>> x = sp.csr_matrix([[1,2,0,0,3],[1,2,0,0,3]])
    >>> check_arraylike_equality(x,x)
    True

    >>> import torch
    >>> x = torch.tensor([1,2,3])
    >>> check_arraylike_equality(x,x)
    True
    """
    x_type, x_is_sparse = get_array_type(x)
    y_type, y_is_sparse = get_array_type(y)

    if x_type != y_type or x_is_sparse != y_is_sparse:
        return False

    if x_type == 'python':
        return x == y

    if x_type == 'numpy':
        # Numpy does not support sparse tensors
        return np.array_equal(x, y)

    if x_type == 'torch':
        import torch

        if x_is_sparse:
            # torch.equal NotImplementedError for sparse
            return all((x - y).coalesce().values() == 0)
        return torch.equal(x, y)

    if x_type == 'scipy':
        # Not implemented in scipy this should work for all types
        # Note: you can't simply look at nonzero values because they can be in
        # different positions.
        return x.shape == y.shape and (x != y).nnz == 0

    if x_type == 'tensorflow':
        if x_is_sparse:
            return x == y
        # Does not have equal implemented, only elementwise, therefore
        # reduce .all is needed
        return (x == y).numpy().all()

    if x_type == 'paddle':
        # Paddle does not support sparse tensor on 11/8/2021
        # https://github.com/PaddlePaddle/Paddle/issues/36697
        # Does not have equal implemented, only elementwise, therefore
        # reduce .all is needed
        return (x == y).numpy().all()

    return False
def detach_tensor_if_present(x: Any) -> Any:
    """Detach a dense torch tensor from the current graph; pass anything else through.

    A dense torch tensor is replaced by a new tensor rebuilt from its numpy
    data, so the result lives in host (CPU) memory and carries no autograd
    history. Sparse torch tensors and every non-torch input are returned
    unchanged.

    :param x: any object, possibly a torch tensor
    :return: a detached copy of ``x`` if it is a dense torch tensor,
        otherwise ``x`` itself
    """
    x_type, x_sparse = get_array_type(x, raise_error_if_not_array=False)

    if x_type == 'torch' and not x_sparse:
        import torch

        # ``Tensor.numpy()`` only works on host memory, so copy to CPU first;
        # ``cpu()`` is a no-op for tensors that already live there.
        x = torch.tensor(x.detach().cpu().numpy())
    return x