
docarray.documents

AudioDoc

Bases: BaseDoc

Document for handling audio.

The Audio Document can contain:

- an AudioUrl (`AudioDoc.url`)
- an AudioTensor (`AudioDoc.tensor`)
- an AnyEmbedding (`AudioDoc.embedding`)
- an AudioBytes object (`AudioDoc.bytes_`)
- an integer representing the frame_rate (`AudioDoc.frame_rate`)

You can use this Document directly:

from docarray.documents import AudioDoc

# use it directly
audio = AudioDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
audio.tensor, audio.frame_rate = audio.url.load()
# model = MyEmbeddingModel()
# audio.embedding = model(audio.tensor)

You can extend this Document:

from docarray.documents import AudioDoc, TextDoc
from typing import Optional


# extend it
class MyAudio(AudioDoc):
    name: Optional[TextDoc] = None


audio = MyAudio(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
audio.name = TextDoc(text='my first audio')
audio.tensor, audio.frame_rate = audio.url.load()
# model = MyEmbeddingModel()
# audio.embedding = model(audio.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import AudioDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    audio: AudioDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    audio=AudioDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load()

# equivalent to
mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes()
mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load()
Source code in docarray/documents/audio.py
class AudioDoc(BaseDoc):
    """
    Document for handling audio.

    The Audio Document can contain:

    - an [`AudioUrl`][docarray.typing.url.AudioUrl] (`AudioDoc.url`)
    - an [`AudioTensor`](../../../api_references/typing/tensor/audio) (`AudioDoc.tensor`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`AudioDoc.embedding`)
    - an [`AudioBytes`][docarray.typing.bytes.AudioBytes] (`AudioDoc.bytes_`) object
    - an integer representing the frame_rate (`AudioDoc.frame_rate`)

    You can use this Document directly:

    ```python
    from docarray.documents import AudioDoc

    # use it directly
    audio = AudioDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import AudioDoc, TextDoc
    from typing import Optional


    # extend it
    class MyAudio(AudioDoc):
        name: Optional[TextDoc] = None


    audio = MyAudio(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.name = TextDoc(text='my first audio')
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import AudioDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        audio: AudioDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        audio=AudioDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load()

    # equivalent to
    mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes()
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load()
    ```
    """

    url: Optional[AudioUrl] = Field(
        description='The url to a (potentially remote) audio file that can be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.mp3?raw=true',
        default=None,
    )
    tensor: Optional[AudioTensor] = Field(
        description='Tensor object of the audio which can be specified as one of `AudioNdArray`, `AudioTorchTensor`, `AudioTensorFlowTensor`',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the audio.',
        example=[0, 1, 0],
        default=None,
    )
    bytes_: Optional[AudioBytes] = Field(
        description='Bytes representation of the audio',
        default=None,
    )
    frame_rate: Optional[int] = Field(
        description='An integer representing the frame rate of the audio.',
        example=24,
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
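
The validators above let a plain URL string (or a raw tensor) stand in for an `AudioDoc` wherever one is expected. A minimal sketch of the string case (the `Page` schema is made up for illustration):

```python
from docarray import BaseDoc
from docarray.documents import AudioDoc


class Page(BaseDoc):
    audio: AudioDoc


# the plain string is coerced into `AudioDoc(url=...)` by the validator
page = Page(
    audio='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
assert page.audio.url is not None
```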

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.
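
A small usage sketch (the `MyDoc` schema is made up for illustration):

```python
from typing import Optional

from docarray import BaseDoc


class MyDoc(BaseDoc):
    title: str
    score: Optional[float] = None


doc = MyDoc(title='hello', score=0.9)

# the full dict also contains the auto-generated `id` field
full = doc.dict()

# restrict the output with `include` / `exclude`
assert doc.dict(include={'title'}) == {'title': 'hello'}
assert doc.dict(exclude={'id', 'score'}) == {'title': 'hello'}
```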

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data

from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64 encoded string

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `data` | `str` | a base64 encoded string | *required* |
| `protocol` | `Literal['pickle', 'protobuf']` | protocol to use. It can be 'pickle' or 'protobuf' | `'pickle'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document object |
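
Note that the defaults differ between directions: `to_base64` defaults to `protocol='protobuf'` while `from_base64` defaults to `'pickle'`, so it is safest to pass the protocol explicitly on both sides. A round-trip sketch:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

b64 = doc.to_base64(protocol='protobuf')
doc2 = TextDoc.from_base64(b64, protocol='protobuf')

assert doc2.text == 'hello'
```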

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `data` | `bytes` | binary bytes | *required* |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document object |
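
A matching round trip through `to_bytes` / `from_bytes` (both default to `protocol='protobuf'`):

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

data = doc.to_bytes()  # protobuf-serialized (optionally compressed) bytes
doc2 = TextDoc.from_bytes(data)

assert doc2.text == 'hello'
```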

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )

from_json(data) classmethod

Build Document object from json data

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document object |
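
A round trip through JSON:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

json_str = doc.json()
doc2 = TextDoc.from_json(json_str)

assert doc2.text == 'hello'
```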

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)

from_protobuf(pb_msg) classmethod

create a Document from a protobuf message

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `pb_msg` | `DocProto` | the proto message of the Document | *required* |

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document initialized with the proto data |
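
A round trip through protobuf, using `to_protobuf` (documented below):

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

pb_msg = doc.to_protobuf()  # a `DocProto` message
doc2 = TextDoc.from_protobuf(pb_msg)

assert doc2.text == 'hello'
```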

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization: we don't even load the data if the key
            # does not match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().
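
A small sketch; extra keyword arguments such as `indent` are forwarded to `json.dumps()`:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

print(doc.json(exclude={'id'}, indent=2))
```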

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
        )  # here we need to keep doclist as doclist otherwise if a user wants to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `b` | `StrBytes` |  | *required* |
| `content_type` | `str` |  | `None` |
| `encoding` | `str` | the encoding to use when parsing a string, defaults to 'utf8' | `'utf8'` |
| `proto` | `Protocol` | protocol to use. | `None` |
| `allow_pickle` | `bool` | allow pickle protocol | `False` |

Returns:

| Type | Description |
|------|-------------|
| `T` | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.
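
Both summaries in action:

```python
from docarray.documents import AudioDoc

AudioDoc.schema_summary()  # class-level: prints the schema

doc = AudioDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
doc.summary()  # instance-level: prints the non-empty fields
```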

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `str` | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).
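
`bytes(...)` is wired to this method, so the two calls below are equivalent:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

assert bytes(doc) == doc.to_bytes()
```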

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compression algorithm to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `bytes` | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
|------|-------------|
| `DocProto` | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating dictionaries of the left with the right

It behaves like an update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Note that tuples are not merged, since they are meant to be immutable: they behave as regular types, and the value of self is simply replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `other` | `T` | The Document with which to update the contents of this | *required* |
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

ImageDoc

Bases: BaseDoc

Document for handling images.

It can contain:

- an ImageUrl (`ImageDoc.url`)
- an ImageTensor (`ImageDoc.tensor`)
- an AnyEmbedding (`ImageDoc.embedding`)
- an ImageBytes object (`ImageDoc.bytes_`)

You can use this Document directly:

from docarray.documents import ImageDoc

# use it directly
image = ImageDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
)
image.tensor = image.url.load()
# model = MyEmbeddingModel()
# image.embedding = model(image.tensor)

You can extend this Document:

from docarray.documents import ImageDoc
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyImage(ImageDoc):
    second_embedding: Optional[AnyEmbedding] = None


image = MyImage(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
)
image.tensor = image.url.load()
# model = MyEmbeddingModel()
# image.embedding = model(image.tensor)
# image.second_embedding = model(image.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import ImageDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    image: ImageDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    image=ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.image.tensor = mmdoc.image.url.load()

# or
mmdoc.image.bytes_ = mmdoc.image.url.load_bytes()
mmdoc.image.tensor = mmdoc.image.bytes_.load()
Source code in docarray/documents/image.py
class ImageDoc(BaseDoc):
    """
    Document for handling images.

    It can contain:

    - an [`ImageUrl`][docarray.typing.url.ImageUrl] (`Image.url`)
    - an [`ImageTensor`](../../../api_references/typing/tensor/image) (`Image.tensor`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`Image.embedding`)
    - an [`ImageBytes`][docarray.typing.bytes.ImageBytes] object (`ImageDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import ImageDoc

    # use it directly
    image = ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    )
    image.tensor = image.url.load()
    # model = MyEmbeddingModel()
    # image.embedding = model(image.tensor)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import ImageDoc
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyImage(ImageDoc):
        second_embedding: Optional[AnyEmbedding] = None


    image = MyImage(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    )
    image.tensor = image.url.load()
    # model = MyEmbeddingModel()
    # image.embedding = model(image.tensor)
    # image.second_embedding = model(image.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import ImageDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        image: ImageDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        image=ImageDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.image.tensor = mmdoc.image.url.load()

    # or
    mmdoc.image.bytes_ = mmdoc.image.url.load_bytes()
    mmdoc.image.tensor = mmdoc.image.bytes_.load()
    ```
    """

    url: Optional[ImageUrl] = Field(
        description='URL to a (potentially remote) image file that needs to be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true',
        default=None,
    )
    tensor: Optional[ImageTensor] = Field(
        description='Tensor object of the image which can be specified as one of `ImageNdArray`, `ImageTorchTensor`, `ImageTensorFlowTensor`.',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the image.',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[ImageBytes] = Field(
        description='Bytes object of the image which is an instance of `ImageBytes`.',
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif (
            isinstance(value, (AbstractTensor, np.ndarray))
            or (torch is not None and isinstance(value, torch.Tensor))
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)
        elif isinstance(value, bytes):
            value = dict(bytes_=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
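
As with `AudioDoc`, the validators let a URL string or a raw tensor stand in for an `ImageDoc`. A sketch of the tensor case (the `Page` schema is made up for illustration):

```python
import numpy as np

from docarray import BaseDoc
from docarray.documents import ImageDoc


class Page(BaseDoc):
    image: ImageDoc


# the raw ndarray is coerced into `ImageDoc(tensor=...)` by the validator
page = Page(image=np.zeros((64, 64, 3), dtype=np.uint8))
assert page.image.tensor is not None
```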

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data

from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64 encoded string

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `data` | `str` | a base64 encoded string | *required* |
| `protocol` | `Literal['pickle', 'protobuf']` | protocol to use. It can be 'pickle' or 'protobuf' | `'pickle'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `data` | `bytes` | binary bytes | *required* |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )

from_json(data) classmethod

Build Document object from json data

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)

from_protobuf(pb_msg) classmethod

create a Document from a protobuf message

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `pb_msg` | `DocProto` | the proto message of the Document | *required* |

Returns:

| Type | Description |
|------|-------------|
| `T` | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization: we don't even load the data if the key
            # does not match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
        )  # here we need to keep doclist as doclist otherwise if a user wants to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `b` | `StrBytes` |  | *required* |
| `content_type` | `str` |  | `None` |
| `encoding` | `str` | the encoding to use when parsing a string, defaults to 'utf8' | `'utf8'` |
| `proto` | `Protocol` | protocol to use. | `None` |
| `allow_pickle` | `bool` | allow pickle protocol | `False` |

Returns:

| Type | Description |
|------|-------------|
| `T` | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `str` | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compression algorithm to use | `None` |

Returns:

| Type | Description |
|------|-------------|
| `bytes` | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
|------|-------------|
| `DocProto` | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating dictionaries of the left with the right

It behaves like an update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Note that tuples are not merged, since they are meant to be immutable: they behave as regular types, and the value of self is simply replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `other` | `T` | The Document with which to update the contents of this | *required* |
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

Mesh3D

Bases: BaseDoc

Document for handling meshes for 3D data representation.

A mesh is a representation for 3D data and contains vertices and faces information. Vertices are points in a 3D space, represented as a tensor of shape (n_points, 3). Faces are triangular surfaces that can be defined by three points in 3D space, corresponding to the three vertices of a triangle. Faces can be represented as a tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a vertex in the tensor of vertices.

The Mesh3D Document can contain:

- a Mesh3DUrl (`Mesh3D.url`)
- a VerticesAndFaces object (`Mesh3D.tensors`) containing:
    - an AnyTensor of vertices (`Mesh3D.tensors.vertices`)
    - an AnyTensor of faces (`Mesh3D.tensors.faces`)
- an AnyEmbedding (`Mesh3D.embedding`)
- a `bytes` object (`Mesh3D.bytes_`)

You can use this Document directly:

from docarray.documents import Mesh3D

# use it directly
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.tensors.vertices)

You can extend this Document:

from docarray.documents import Mesh3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyMesh3D(Mesh3D):
    name: Optional[str] = None


mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.name = 'my first mesh'
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.vertices)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import Mesh3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    mesh: Mesh3D
    text: TextDoc


mmdoc = MultiModalDoc(
    mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.mesh.tensors = mmdoc.mesh.url.load()

# or
mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()

You can display your 3D mesh in a notebook from either its url, or its tensors:

from docarray.documents import Mesh3D

# display from url
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# mesh.url.display()

# display from tensors
mesh.tensors = mesh.url.load()
# mesh.tensors.display()
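
Once loaded, the two tensors line up with the description above: `vertices` has shape (n_points, 3) and `faces` has shape (n_faces, 3), with each face entry indexing into the vertices. A sketch (loading assumes the `trimesh` package is installed and network access is available):

```python
from docarray.documents import Mesh3D

mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.tensors = mesh.url.load()

print(mesh.tensors.vertices.shape)  # (n_points, 3)
print(mesh.tensors.faces.shape)  # (n_faces, 3)
```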
Source code in docarray/documents/mesh/mesh_3d.py
class Mesh3D(BaseDoc):
    """
    Document for handling meshes for 3D data representation.

    A mesh is a representation for 3D data and contains vertices and faces information.
    Vertices are points in a 3D space, represented as a tensor of shape (n_points, 3).
    Faces are triangular surfaces that can be defined by three points in 3D space,
    corresponding to the three vertices of a triangle. Faces can be represented as a
    tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a
    vertex in the tensor of vertices.

    The Mesh3D Document can contain:

    - an [`Mesh3DUrl`][docarray.typing.url.Mesh3DUrl] (`Mesh3D.url`)
    - a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces]
    object containing:

        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of
        vertices (`Mesh3D.tensors.vertices`)
        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of faces (`Mesh3D.tensors.faces`)

    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`Mesh3D.embedding`)
    - a `bytes` object (`Mesh3D.bytes_`).

    You can use this Document directly:

    ```python
    from docarray.documents import Mesh3D

    # use it directly
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.tensors.vertices)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import Mesh3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyMesh3D(Mesh3D):
        name: Optional[str] = None


    mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.name = 'my first mesh'
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.vertices)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import Mesh3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        mesh: Mesh3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.mesh.tensors = mmdoc.mesh.url.load()

    # or
    mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()
    ```

    You can display your 3D mesh in a notebook from either its url, or its tensors:

    ```python
    from docarray.documents import Mesh3D

    # display from url
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # mesh.url.display()

    # display from tensors
    mesh.tensors = mesh.url.load()
    # mesh.tensors.display()
    ```

    """

    url: Optional[Mesh3DUrl] = Field(
        description='URL to a file containing 3D mesh information. Can be remote (web) URL, or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[VerticesAndFaces] = Field(
        description='A tensor object of 3D mesh of type `VerticesAndFaces`.',
        example=[[0, 1, 1], [1, 0, 1], [1, 1, 0]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the 3D mesh.',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D mesh.',
        default=None,
    )

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            if isinstance(value, str):
                return {'url': value}
            return value

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, Any],
        ) -> T:
            if isinstance(value, str):
                value = cls(url=value)
            return super().validate(value)
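
These validators are what allow a bare URL string to be used anywhere a `Mesh3D` is expected: both the pydantic v1 and v2 branches coerce a `str` into `{'url': ...}`. A minimal sketch of that coercion; the `Scene` wrapper doc is made up for illustration:

```python
from docarray import BaseDoc
from docarray.documents import Mesh3D


class Scene(BaseDoc):  # hypothetical wrapper doc
    mesh: Mesh3D


# thanks to the validator above, a bare URL string becomes Mesh3D(url=...)
scene = Scene(mesh='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
assert scene.mesh.url is not None
```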

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
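
A quick sketch of the `exclude` behavior, assuming the pydantic-v1 code path shown above (under pydantic v2 you would reach for `model_dump` instead):

```python
from docarray.documents import Mesh3D

mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')

# exclude drops fields from the output dict, as in pydantic
d = mesh.dict(exclude={'embedding', 'bytes_'})
assert 'embedding' not in d and 'url' in d
```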

from_base64(data, protocol='pickle', compress=None) classmethod

Build Document object from binary bytes

Parameters:

- `data` (str): a base64 encoded string. Required.
- `protocol` (Literal['pickle', 'protobuf']): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'pickle'.
- `compress` (Optional[str]): compress method to use. Default: None.

Returns:

- `T`: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
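
Note the asymmetric defaults: `to_base64` (documented below) defaults to `'protobuf'` while `from_base64` defaults to `'pickle'`, so it is safest to pass `protocol` explicitly on both ends. A minimal round-trip sketch:

```python
from docarray.documents import Mesh3D

mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')

s = mesh.to_base64(protocol='protobuf')
restored = Mesh3D.from_base64(s, protocol='protobuf')
assert restored.url == mesh.url
```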

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

- `data` (bytes): binary bytes. Required.
- `protocol` (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
- `compress` (Optional[str]): compress method to use. Default: None.

Returns:

- `T`: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
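
`to_bytes` and `from_bytes` share the `'protobuf'` default, so a plain round trip needs no extra arguments; a minimal sketch:

```python
from docarray.documents import Mesh3D

mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')

raw = mesh.to_bytes()  # protobuf by default, symmetric with from_bytes
restored = Mesh3D.from_bytes(raw)
assert restored.url == mesh.url
```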

from_json(data) classmethod

Build Document object from json data

Returns:

- `T`: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
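
`from_json` is the inverse of `json()` below (or `model_dump_json` under pydantic v2). A minimal sketch using a hand-written JSON string:

```python
from docarray.documents import Mesh3D

# fields with defaults (id, tensors, ...) are filled in automatically
restored = Mesh3D.from_json(
    '{"url": "https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj"}'
)
assert restored.url is not None
```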

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

- `pb_msg` (DocProto): the proto message of the Document. Required.

Returns:

- `T`: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
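
`from_protobuf` pairs with `to_protobuf` (documented below); a minimal round-trip sketch:

```python
from docarray.documents import Mesh3D

mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')

pb_msg = mesh.to_protobuf()
restored = Mesh3D.from_protobuf(pb_msg)
assert restored.url == mesh.url
```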

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

- `b` (StrBytes): the raw string or bytes to parse. Required.
- `content_type` (str): Default: None.
- `encoding` (str): the encoding to use when parsing a string. Default: 'utf8'.
- `proto` (Protocol): protocol to use. Default: None.
- `allow_pickle` (bool): allow pickle protocol. Default: False.

Returns:

- `T`: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
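
Both helpers are console printers with no return value; a minimal usage sketch:

```python
from docarray.documents import Mesh3D

Mesh3D.schema_summary()  # prints the class schema

mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.summary()  # prints the non-empty fields of this instance
```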

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string.

Parameters:

- `protocol` (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
- `compress` (Optional[str]): compress method to use. Default: None.

Returns:

- `str`: a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

- `protocol` (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
- `compress` (Optional[str]): compression algorithm to use. Default: None.

Returns:

- `bytes`: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

- `DocProto`: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self with the content of other. Changes are applied in place. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Updating Documents and DocLists recursively
  • Updating the dictionaries of the left Document with those of the right

It behaves like a dictionary update, except that, since it is applied to a static schema type, a field counts as present only if its value is not None, and DocLists, lists, and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave as regular values, so the value in self is simply overwritten by the value in other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

- `other` (T): the Document with which to update the contents of this one. Required.
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

PointCloud3D

Bases: BaseDoc

Document for handling point clouds for 3D data representation.

A point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly sampling points on the surface of the 3D body. Compared to the mesh representation, a point cloud is a fixed-size ndarray of shape (n_samples, 3) and hence easier for deep learning algorithms to handle.

A PointCloud3D Document can contain:

  • a PointCloud3DUrl (PointCloud3D.url)
  • a PointsAndColors object (PointCloud3D.tensors)
  • an AnyEmbedding (PointCloud3D.embedding)
  • a bytes object (PointCloud3D.bytes_)

You can use this Document directly:

from docarray.documents import PointCloud3D

# use it directly
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)

You can extend this Document:

from docarray.documents import PointCloud3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyPointCloud3D(PointCloud3D):
    second_embedding: Optional[AnyEmbedding] = None


pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)
# pc.second_embedding = model(pc.tensors.colors)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import PointCloud3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    point_cloud: PointCloud3D
    text: TextDoc


mmdoc = MultiModalDoc(
    point_cloud=PointCloud3D(
        url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

# or
mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()

You can display your point cloud from either its url, or its tensors:

from docarray.documents import PointCloud3D

# display from url
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# pc.url.display()

# display from tensors
pc.tensors = pc.url.load(samples=10000)
# pc.tensors.display()
Source code in docarray/documents/point_cloud/point_cloud_3d.py
class PointCloud3D(BaseDoc):
    """
    Document for handling point clouds for 3D data representation.

    Point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly
    sampling points within the surface of the 3D body. Compared to the mesh
    representation, the point cloud is a fixed size ndarray of shape `(n_samples, 3)` and
    hence easier for deep learning algorithms to handle.

    A PointCloud3D Document can contain:

    - a [`PointCloud3DUrl`][docarray.typing.url.PointCloud3DUrl] (`PointCloud3D.url`)
    - a [`PointsAndColors`][docarray.documents.point_cloud.points_and_colors.PointsAndColors] object (`PointCloud3D.tensors`)
    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`PointCloud3D.embedding`)
    - a `bytes` object (`PointCloud3D.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import PointCloud3D

    # use it directly
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import PointCloud3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyPointCloud3D(PointCloud3D):
        second_embedding: Optional[AnyEmbedding] = None


    pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    # pc.second_embedding = model(pc.tensors.colors)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import PointCloud3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        point_cloud: PointCloud3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        point_cloud=PointCloud3D(
            url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

    # or
    mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()
    ```

    You can display your point cloud from either its url, or its tensors:

    ```python
    from docarray.documents import PointCloud3D

    # display from url
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # pc.url.display()

    # display from tensors
    pc.tensors = pc.url.load(samples=10000)
    # pc.tensors.display()
    ```
    """

    url: Optional[PointCloud3DUrl] = Field(
        description='URL to a file containing point cloud information. Can be remote (web) URL, or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[PointsAndColors] = Field(
        description='A tensor object of 3D point cloud of type `PointsAndColors`.',
        example=[[0, 0, 1], [1, 0, 1], [0, 1, 1]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of 3D point cloud.',
        example=[1, 1, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D point cloud.',
        default=None,
    )

    @classmethod
    def _validate(self, value: Union[str, AbstractTensor, Any]) -> Any:
        if isinstance(value, str):
            value = {'url': value}
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = {'tensors': PointsAndColors(points=value)}

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
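
The `_validate` hook above also accepts a raw tensor and wraps it in `PointsAndColors`. A minimal sketch of that coercion; the `Scene` wrapper doc is made up for illustration:

```python
import numpy as np

from docarray import BaseDoc
from docarray.documents import PointCloud3D


class Scene(BaseDoc):  # hypothetical wrapper doc
    pc: PointCloud3D


# a raw (n_points, 3) array is coerced into
# PointCloud3D(tensors=PointsAndColors(points=...))
scene = Scene(pc=np.zeros((100, 3)))
assert scene.pc.tensors.points.shape == (100, 3)
```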

PointCloud3D inherits `dict`, `json`, `parse_raw`, `from_base64`, `from_bytes`, `from_json`, `from_protobuf`, `to_base64`, `to_bytes`, `to_protobuf`, `schema_summary`, `summary`, and `update` from BaseDoc. Their documentation is identical to the sections above.

PointsAndColors

Bases: BaseDoc

Document for handling the tensor data of a PointCloud3D object.

A PointsAndColors Document can contain:

  • an AnyTensor containing the points in 3D space information (PointsAndColors.points)
  • an AnyTensor containing the points' color information (PointsAndColors.colors)
Source code in docarray/documents/point_cloud/points_and_colors.py
class PointsAndColors(BaseDoc):
    """
    Document for handling the tensor data of a [`PointCloud3D`][docarray.documents.point_cloud.PointCloud3D] object.

    A PointsAndColors Document can contain:

    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points in 3D space information (`PointsAndColors.points`)
    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points' color information (`PointsAndColors.colors`)
    """

    points: AnyTensor
    colors: Optional[AnyTensor] = None

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[str, AbstractTensor, Any],
    ) -> T:
        if isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = cls(points=value)

        return super().validate(value)

    def display(self) -> None:
        """
        Plot point cloud consisting of points in 3D space and optionally colors.
        """
        if TYPE_CHECKING:
            import trimesh
        else:
            trimesh = import_library('trimesh', raise_error=True)
        from IPython.display import display

        colors = (
            self.colors
            if self.colors is not None
            else np.tile(
                np.array([0, 0, 0]),
                (self.points.get_comp_backend().shape(self.points)[0], 1),
            )
        )
        pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

        s = trimesh.Scene(geometry=pc)
        display(s.show())
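
A minimal construction sketch; the all-zero `colors` array is only an assumption to exercise the optional field, and `display()` additionally needs `trimesh` plus a notebook environment:

```python
import numpy as np

from docarray.documents.point_cloud.points_and_colors import PointsAndColors

points = np.random.random((500, 3))
colors = np.zeros((500, 3))  # optional; all-black here
pac = PointsAndColors(points=points, colors=colors)
# pac.display()  # opens an interactive trimesh scene in a notebook
```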


display()

Plot point cloud consisting of points in 3D space and optionally colors.

Source code in docarray/documents/point_cloud/points_and_colors.py
def display(self) -> None:
    """
    Plot point cloud consisting of points in 3D space and optionally colors.
    """
    if TYPE_CHECKING:
        import trimesh
    else:
        trimesh = import_library('trimesh', raise_error=True)
    from IPython.display import display

    colors = (
        self.colors
        if self.colors is not None
        else np.tile(
            np.array([0, 0, 0]),
            (self.points.get_comp_backend().shape(self.points)[0], 1),
        )
    )
    pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

    s = trimesh.Scene(geometry=pc)
    display(s.show())

PointsAndColors likewise inherits `dict`, `json`, `parse_raw`, the serialization helpers (`from_base64`, `from_bytes`, `from_json`, `from_protobuf`, `to_base64`, `to_bytes`, `to_protobuf`), `schema_summary`, `summary`, and `update` from BaseDoc; see the sections documented above.