
Embedding

docarray.typing.tensor.embedding.embedding

AnyEmbedding

Bases: AnyTensor, EmbeddingMixin

Represents an embedding tensor object that can be used with TensorFlow, PyTorch, and NumPy.


```python
from docarray import BaseDoc
from docarray.typing import AnyEmbedding


class MyEmbeddingDoc(BaseDoc):
    embedding: AnyEmbedding


# Example usage with TensorFlow:
import tensorflow as tf

doc = MyEmbeddingDoc(embedding=tf.zeros((1000, 2)))
type(doc.embedding)  # TensorFlowEmbedding

# Example usage with PyTorch:
import torch

doc = MyEmbeddingDoc(embedding=torch.zeros(1000, 2))
type(doc.embedding)  # TorchEmbedding

# Example usage with NumPy:
import numpy as np

doc = MyEmbeddingDoc(embedding=np.zeros((1000, 2)))
type(doc.embedding)  # NdArrayEmbedding
```


Raises:

  • TypeError: If the type of the value is not one of [torch.Tensor, tensorflow.Tensor, numpy.ndarray].

Source code in docarray/typing/tensor/embedding/embedding.py
class AnyEmbedding(AnyTensor, EmbeddingMixin):
    """
    Represents an embedding tensor object that can be used with TensorFlow, PyTorch, and NumPy.

    ---
    ```python
    from docarray import BaseDoc
    from docarray.typing import AnyEmbedding


    class MyEmbeddingDoc(BaseDoc):
        embedding: AnyEmbedding


    # Example usage with TensorFlow:
    import tensorflow as tf

    doc = MyEmbeddingDoc(embedding=tf.zeros((1000, 2)))
    type(doc.embedding)  # TensorFlowEmbedding

    # Example usage with PyTorch:
    import torch

    doc = MyEmbeddingDoc(embedding=torch.zeros(1000, 2))
    type(doc.embedding)  # TorchEmbedding

    # Example usage with NumPy:
    import numpy as np

    doc = MyEmbeddingDoc(embedding=np.zeros((1000, 2)))
    type(doc.embedding)  # NdArrayEmbedding
    ```
    ---

    Raises:
        TypeError: If the type of the value is not one of [torch.Tensor, tensorflow.Tensor, numpy.ndarray]
    """

    @classmethod
    def _docarray_validate(
        cls: Type[T],
        value: Union[T, np.ndarray, Any],
    ):
        if torch_available:
            if isinstance(value, TorchTensor):
                return cast(TorchEmbedding, value)
            elif isinstance(value, torch.Tensor):
                return TorchEmbedding._docarray_from_native(value)  # noqa
        if tf_available:
            if isinstance(value, TensorFlowTensor):
                return cast(TensorFlowEmbedding, value)
            elif isinstance(value, tf.Tensor):
                return TensorFlowEmbedding._docarray_from_native(value)  # noqa
        if jax_available:
            if isinstance(value, JaxArray):
                return cast(JaxArrayEmbedding, value)
            elif isinstance(value, jnp.ndarray):
                return JaxArrayEmbedding._docarray_from_native(value)  # noqa
        try:
            return NdArrayEmbedding._docarray_validate(value)
        except Exception:  # noqa
            pass
        raise TypeError(
            f"Expected one of [torch.Tensor, tensorflow.Tensor, numpy.ndarray] "
            f"compatible type, got {type(value)}"
        )
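
Note that, as the jax_available branch above shows, JAX arrays are also accepted when jax is installed, even though the docstring only lists the three main backends. A minimal sketch, assuming a working jax install:

```python
import jax.numpy as jnp

from docarray import BaseDoc
from docarray.typing import AnyEmbedding


class MyEmbeddingDoc(BaseDoc):
    embedding: AnyEmbedding


doc = MyEmbeddingDoc(embedding=jnp.zeros((1000, 2)))
type(doc.embedding)  # JaxArrayEmbedding
```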

__docarray_validate_shape__(t, shape) classmethod

Every tensor has to implement this method in order to enable syntax of the form AnyTensor[shape]. It is called when a tensor is assigned to a field of this type, i.e. when a tensor is passed to a Document field of type AnyTensor[shape].

The intended behaviour is as follows:

  • If the shape of t is equal to shape, return t.
  • If the shape of t is not equal to shape, but can be reshaped to shape, return t reshaped to shape.
  • If the shape of t is not equal to shape and cannot be reshaped to shape, raise a ValueError.

Parameters:

  • t (T): The tensor to validate. Required.
  • shape (Tuple[Union[int, str], ...]): The shape to validate against. Required.

Returns:

  • T: The validated tensor.

Source code in docarray/typing/tensor/abstract_tensor.py
@classmethod
def __docarray_validate_shape__(cls, t: T, shape: Tuple[Union[int, str], ...]) -> T:
    """Every tensor has to implement this method in order to
    enable syntax of the form AnyTensor[shape].
    It is called when a tensor is assigned to a field of this type.
    i.e. when a tensor is passed to a Document field of type AnyTensor[shape].

    The intended behaviour is as follows:

    - If the shape of `t` is equal to `shape`, return `t`.
    - If the shape of `t` is not equal to `shape`,
        but can be reshaped to `shape`, return `t` reshaped to `shape`.
    - If the shape of `t` is not equal to `shape`
        and cannot be reshaped to `shape`, raise a ValueError.

    :param t: The tensor to validate.
    :param shape: The shape to validate against.
    :return: The validated tensor.
    """
    comp_be = t.get_comp_backend()
    tshape = comp_be.shape(t)
    if tshape == shape:
        return t
    elif any(isinstance(dim, str) or dim == Ellipsis for dim in shape):
        ellipsis_occurrences = [
            pos for pos, dim in enumerate(shape) if dim == Ellipsis
        ]
        if ellipsis_occurrences:
            if len(ellipsis_occurrences) > 1:
                raise ValueError(
                    f'Cannot use Ellipsis (...) more than once for the shape {shape}'
                )
            ellipsis_pos = ellipsis_occurrences[0]
            # Calculate how many dimensions to add. Should be at least 1.
            dimensions_needed = max(len(tshape) - len(shape) + 1, 1)
            shape = (
                shape[:ellipsis_pos]
                + tuple(
                    f'__dim_var_{index}__' for index in range(dimensions_needed)
                )
                + shape[ellipsis_pos + 1 :]
            )

        if len(tshape) != len(shape):
            raise ValueError(
                f'Tensor shape mismatch. Expected {shape}, got {tshape}'
            )
        known_dims: Dict[str, int] = {}
        for tdim, dim in zip(tshape, shape):
            if isinstance(dim, int) and tdim != dim:
                raise ValueError(
                    f'Tensor shape mismatch. Expected {shape}, got {tshape}'
                )
            elif isinstance(dim, str):
                if dim in known_dims and known_dims[dim] != tdim:
                    raise ValueError(
                        f'Tensor shape mismatch. Expected {shape}, got {tshape}'
                    )
                else:
                    known_dims[dim] = tdim
        else:
            return t
    else:
        shape = cast(Tuple[int], shape)
        warnings.warn(
            f'Tensor shape mismatch. Reshaping tensor '
            f'of shape {tshape} to shape {shape}'
        )
        try:
            value = cls._docarray_from_native(comp_be.reshape(t, shape))
            return cast(T, value)
        except RuntimeError:
            raise ValueError(
                f'Cannot reshape tensor of shape {tshape} to shape {shape}'
            )
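
To illustrate these rules from the user side, here is a minimal sketch, assuming torch is installed and using the parametrized-shape syntax with string axes that the signature above describes (the document class MyDoc is purely illustrative). Repeated string axes must agree with each other, while an all-int shape mismatch falls through to the reshape branch with a warning:

```python
import torch

from docarray import BaseDoc
from docarray.typing import TorchTensor


class MyDoc(BaseDoc):
    # first axis is fixed to 3; the two 'x' axes must match each other
    tensor: TorchTensor[3, 'x', 'x']


doc = MyDoc(tensor=torch.zeros(3, 224, 224))  # ok: x = 224

try:
    MyDoc(tensor=torch.zeros(3, 224, 256))  # the two 'x' axes disagree
except ValueError as e:
    print(e)  # Tensor shape mismatch ...
```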

unwrap()

Return the native tensor object that this DocArray tensor wraps.

Source code in docarray/typing/tensor/abstract_tensor.py
def unwrap(self):
    """Return the native tensor object that this DocList tensor wraps."""

docarray.typing.tensor.embedding.embedding_mixin

docarray.typing.tensor.embedding.ndarray

docarray.typing.tensor.embedding.tensorflow

docarray.typing.tensor.embedding.torch

TorchEmbedding

Bases: TorchTensor, EmbeddingMixin

Source code in docarray/typing/tensor/embedding/torch.py
@_register_proto(proto_type_name='torch_embedding')
class TorchEmbedding(TorchTensor, EmbeddingMixin, metaclass=metaTorchAndEmbedding):
    alternative_type = TorchTensor

    def new_empty(self, *args, **kwargs):
        """
        This method enables the deepcopy of `TorchEmbedding` by returning another instance of this subclass.
        If this function is not implemented, the deepcopy will throw a RuntimeError from Torch.
        """
        return self.__class__(TorchTensor.new_empty(self, *args, **kwargs))

__deepcopy__(memo)

Custom implementation of deepcopy for TorchTensor to avoid storage sharing issues.

Source code in docarray/typing/tensor/torch_tensor.py
def __deepcopy__(self, memo):
    """
    Custom implementation of deepcopy for TorchTensor to avoid storage sharing issues.
    """
    # Create a new tensor with the same data and properties
    new_tensor = self.clone()
    # Set the class to the custom TorchTensor class
    new_tensor.__class__ = self.__class__
    return new_tensor
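
Together with new_empty above, this makes TorchEmbedding safe to deep-copy. A minimal sketch, assuming torch is installed:

```python
from copy import deepcopy

import torch
from pydantic import parse_obj_as

from docarray.typing import TorchEmbedding

emb = parse_obj_as(TorchEmbedding, torch.zeros(128))
emb_copy = deepcopy(emb)

assert isinstance(emb_copy, TorchEmbedding)
assert emb_copy.data_ptr() != emb.data_ptr()  # storage is not shared
```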

__docarray_validate_shape__(t, shape) classmethod

Every tensor has to implement this method in order to enable syntax of the form AnyTensor[shape]. It is called when a tensor is assigned to a field of this type, i.e. when a tensor is passed to a Document field of type AnyTensor[shape].

The intended behaviour is as follows:

  • If the shape of t is equal to shape, return t.
  • If the shape of t is not equal to shape, but can be reshaped to shape, return t reshaped to shape.
  • If the shape of t is not equal to shape and cannot be reshaped to shape, raise a ValueError.

Parameters:

  • t (T): The tensor to validate. Required.
  • shape (Tuple[Union[int, str], ...]): The shape to validate against. Required.

Returns:

  • T: The validated tensor.

Source code in docarray/typing/tensor/abstract_tensor.py
@classmethod
def __docarray_validate_shape__(cls, t: T, shape: Tuple[Union[int, str], ...]) -> T:
    """Every tensor has to implement this method in order to
    enable syntax of the form AnyTensor[shape].
    It is called when a tensor is assigned to a field of this type.
    i.e. when a tensor is passed to a Document field of type AnyTensor[shape].

    The intended behaviour is as follows:

    - If the shape of `t` is equal to `shape`, return `t`.
    - If the shape of `t` is not equal to `shape`,
        but can be reshaped to `shape`, return `t` reshaped to `shape`.
    - If the shape of `t` is not equal to `shape`
        and cannot be reshaped to `shape`, raise a ValueError.

    :param t: The tensor to validate.
    :param shape: The shape to validate against.
    :return: The validated tensor.
    """
    comp_be = t.get_comp_backend()
    tshape = comp_be.shape(t)
    if tshape == shape:
        return t
    elif any(isinstance(dim, str) or dim == Ellipsis for dim in shape):
        ellipsis_occurrences = [
            pos for pos, dim in enumerate(shape) if dim == Ellipsis
        ]
        if ellipsis_occurrences:
            if len(ellipsis_occurrences) > 1:
                raise ValueError(
                    f'Cannot use Ellipsis (...) more than once for the shape {shape}'
                )
            ellipsis_pos = ellipsis_occurrences[0]
            # Calculate how many dimensions to add. Should be at least 1.
            dimensions_needed = max(len(tshape) - len(shape) + 1, 1)
            shape = (
                shape[:ellipsis_pos]
                + tuple(
                    f'__dim_var_{index}__' for index in range(dimensions_needed)
                )
                + shape[ellipsis_pos + 1 :]
            )

        if len(tshape) != len(shape):
            raise ValueError(
                f'Tensor shape mismatch. Expected {shape}, got {tshape}'
            )
        known_dims: Dict[str, int] = {}
        for tdim, dim in zip(tshape, shape):
            if isinstance(dim, int) and tdim != dim:
                raise ValueError(
                    f'Tensor shape mismatch. Expected {shape}, got {tshape}'
                )
            elif isinstance(dim, str):
                if dim in known_dims and known_dims[dim] != tdim:
                    raise ValueError(
                        f'Tensor shape mismatch. Expected {shape}, got {tshape}'
                    )
                else:
                    known_dims[dim] = tdim
        else:
            return t
    else:
        shape = cast(Tuple[int], shape)
        warnings.warn(
            f'Tensor shape mismatch. Reshaping tensor '
            f'of shape {tshape} to shape {shape}'
        )
        try:
            value = cls._docarray_from_native(comp_be.reshape(t, shape))
            return cast(T, value)
        except RuntimeError:
            raise ValueError(
                f'Cannot reshape tensor of shape {tshape} to shape {shape}'
            )

__getitem__(item) abstractmethod

Get a slice of this tensor.

Source code in docarray/typing/tensor/abstract_tensor.py
@abc.abstractmethod
def __getitem__(self: T, item) -> T:
    """Get a slice of this tensor."""
    ...

__iter__() abstractmethod

Iterate over the elements of this tensor.

Source code in docarray/typing/tensor/abstract_tensor.py
@abc.abstractmethod
def __iter__(self):
    """Iterate over the elements of this tensor."""
    ...

__setitem__(index, value) abstractmethod

Set a slice of this tensor.

Source code in docarray/typing/tensor/abstract_tensor.py
@abc.abstractmethod
def __setitem__(self, index, value):
    """Set a slice of this tensor."""
    ...

from_ndarray(value) classmethod

Create a TorchTensor from a numpy array

Parameters:

  • value (ndarray): the numpy array. Required.

Returns:

  • T: a TorchTensor.

Source code in docarray/typing/tensor/torch_tensor.py
@classmethod
def from_ndarray(cls: Type[T], value: np.ndarray) -> T:
    """Create a `TorchTensor` from a numpy array

    :param value: the numpy array
    :return: a `TorchTensor`
    """
    return cls._docarray_from_native(torch.from_numpy(value))
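
A small usage sketch:

```python
import numpy as np

from docarray.typing import TorchTensor

t = TorchTensor.from_ndarray(np.zeros((3, 4), dtype=np.float32))

print(type(t))  # <class 'docarray.typing.tensor.torch_tensor.TorchTensor'>
print(t.shape)  # torch.Size([3, 4])
```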

from_protobuf(pb_msg) classmethod

Read ndarray from a proto msg

Parameters:

  • pb_msg (NdArrayProto): the protobuf message to read the tensor from. Required.

Returns:

  • T: a TorchTensor.

Source code in docarray/typing/tensor/torch_tensor.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'NdArrayProto') -> 'T':
    """
    Read ndarray from a proto msg
    :param pb_msg:
    :return: a `TorchTensor`
    """
    source = pb_msg.dense
    if source.buffer:
        x = np.frombuffer(bytearray(source.buffer), dtype=source.dtype)
        return cls.from_ndarray(x.reshape(source.shape))
    elif len(source.shape) > 0:
        return cls.from_ndarray(np.zeros(source.shape))
    else:
        raise ValueError(f'proto message {pb_msg} cannot be cast to a TorchTensor')

get_comp_backend() staticmethod

Return the computational backend of the tensor

Source code in docarray/typing/tensor/torch_tensor.py
@staticmethod
def get_comp_backend() -> 'TorchCompBackend':
    """Return the computational backend of the tensor"""
    from docarray.computation.torch_backend import TorchCompBackend

    return TorchCompBackend()

new_empty(*args, **kwargs)

This method enables the deepcopy of TorchEmbedding by returning another instance of this subclass. If this function is not implemented, the deepcopy will throw a RuntimeError from Torch.

Source code in docarray/typing/tensor/embedding/torch.py
def new_empty(self, *args, **kwargs):
    """
    This method enables the deepcopy of `TorchEmbedding` by returning another instance of this subclass.
    If this function is not implemented, the deepcopy will throw a RuntimeError from Torch.
    """
    return self.__class__(TorchTensor.new_empty(self, *args, **kwargs))

to_protobuf()

Transform self into an NdArrayProto protobuf message

Source code in docarray/typing/tensor/torch_tensor.py
def to_protobuf(self) -> 'NdArrayProto':
    """
    Transform self into an `NdArrayProto` protobuf message
    """
    from docarray.proto import NdArrayProto

    nd_proto = NdArrayProto()

    value_np = self.detach().cpu().numpy()
    nd_proto.dense.buffer = value_np.tobytes()
    nd_proto.dense.ClearField('shape')
    nd_proto.dense.shape.extend(list(value_np.shape))
    nd_proto.dense.dtype = value_np.dtype.str

    return nd_proto
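
A minimal round-trip sketch combining to_protobuf with from_protobuf above, assuming torch is installed:

```python
import torch
from pydantic import parse_obj_as

from docarray.typing import TorchTensor

t = parse_obj_as(TorchTensor, torch.rand(2, 3))

proto = t.to_protobuf()                # serialize into an NdArrayProto
t2 = TorchTensor.from_protobuf(proto)  # read it back as a TorchTensor

assert torch.equal(t, t2)  # byte-exact round trip
```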

unwrap()

Return the original torch.Tensor without any memory copy.

The original view remains intact and is still a Document TorchTensor, but the returned object is a pure torch.Tensor; both objects share the same memory layout.


from docarray.typing import TorchTensor
import torch
from pydantic import parse_obj_as


t = parse_obj_as(TorchTensor, torch.zeros(3, 224, 224))
# here t is a docarray TorchTensor
t2 = t.unwrap()
# here t2 is a pure torch.Tensor but t is still a docarray TorchTensor
# But both share the same underlying memory

Returns:

  • Tensor: a torch.Tensor.

Source code in docarray/typing/tensor/torch_tensor.py
def unwrap(self) -> torch.Tensor:
    """
    Return the original `torch.Tensor` without any memory copy.

    The original view remains intact and is still a Document `TorchTensor`,
    but the returned object is a pure `torch.Tensor`; both objects share
    the same memory layout.

    ---

    ```python
    from docarray.typing import TorchTensor
    import torch
    from pydantic import parse_obj_as


    t = parse_obj_as(TorchTensor, torch.zeros(3, 224, 224))
    # here t is a docarray TorchTensor
    t2 = t.unwrap()
    # here t2 is a pure torch.Tensor but t is still a docarray TorchTensor
    # But both share the same underlying memory
    ```

    ---

    :return: a `torch.Tensor`
    """
    value = copy(self)  # as unintuitive as it sounds, this
    # does not do any relevant memory copying, just shallow
    # reference to the torch data
    value.__class__ = torch.Tensor  # type: ignore
    return value