docarray.documents

AudioDoc

Bases: BaseDoc

Document for handling audio.

The Audio Document can contain:

- an AudioUrl (`AudioDoc.url`)
- an AudioTensor (`AudioDoc.tensor`)
- an AnyEmbedding (`AudioDoc.embedding`)
- an AudioBytes object (`AudioDoc.bytes_`)
- an integer representing the frame rate (`AudioDoc.frame_rate`)

You can use this Document directly:

from docarray.documents import AudioDoc

# use it directly
audio = AudioDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
audio.tensor, audio.frame_rate = audio.url.load()
# model = MyEmbeddingModel()
# audio.embedding = model(audio.tensor)

You can extend this Document:

from docarray.documents import AudioDoc, TextDoc
from typing import Optional


# extend it
class MyAudio(AudioDoc):
    name: Optional[TextDoc] = None


audio = MyAudio(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
audio.name = TextDoc(text='my first audio')
audio.tensor, audio.frame_rate = audio.url.load()
# model = MyEmbeddingModel()
# audio.embedding = model(audio.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import AudioDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    audio: AudioDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    audio=AudioDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load()

# equivalent to
mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes()
mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load()
Source code in docarray/documents/audio.py
class AudioDoc(BaseDoc):
    """
    Document for handling audio.

    The Audio Document can contain:

    - an [`AudioUrl`][docarray.typing.url.AudioUrl] (`AudioDoc.url`)
    - an [`AudioTensor`](../../../api_references/typing/tensor/audio) (`AudioDoc.tensor`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`AudioDoc.embedding`)
    - an [`AudioBytes`][docarray.typing.bytes.AudioBytes] (`AudioDoc.bytes_`) object
    - an integer representing the frame_rate (`AudioDoc.frame_rate`)

    You can use this Document directly:

    ```python
    from docarray.documents import AudioDoc

    # use it directly
    audio = AudioDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import AudioDoc, TextDoc
    from typing import Optional


    # extend it
    class MyAudio(AudioDoc):
        name: Optional[TextDoc] = None


    audio = MyAudio(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.name = TextDoc(text='my first audio')
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import AudioDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        audio: AudioDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        audio=AudioDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load()

    # equivalent to
    mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes()
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load()
    ```
    """

    url: Optional[AudioUrl] = Field(
        description='The url to a (potentially remote) audio file that can be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.mp3?raw=true',
        default=None,
    )
    tensor: Optional[AudioTensor] = Field(
        description='Tensor object of the audio, which can be one of `AudioNdArray`, `AudioTorchTensor`, or `AudioTensorFlowTensor`.',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the audio.',
        example=[0, 1, 0],
        default=None,
    )
    bytes_: Optional[AudioBytes] = Field(
        description='Bytes representation of the audio',
        default=None,
    )
    frame_rate: Optional[int] = Field(
        description='An integer representing the frame rate of the audio.',
        example=24,
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif (
            isinstance(value, (AbstractTensor, np.ndarray))
            or (torch is not None and isinstance(value, torch.Tensor))
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
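
As the validators above show, a plain string is coerced into `dict(url=value)` and a raw tensor into `dict(tensor=value)`. A minimal sketch of what this enables when `AudioDoc` is nested inside another schema (the `Wrapper` class here is hypothetical):

```python
from docarray import BaseDoc
from docarray.documents import AudioDoc


class Wrapper(BaseDoc):
    audio: AudioDoc


# the `before` validator coerces the bare string into AudioDoc(url=...)
doc = Wrapper(
    audio='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
assert doc.audio.url is not None
```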

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
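
For example, `exclude` can drop heavy fields from the resulting dictionary (a minimal sketch under the pydantic-v1-style `dict()` shown here; the field names follow the `AudioDoc` schema above):

```python
from docarray.documents import AudioDoc

doc = AudioDoc(frame_rate=44100)
d = doc.dict(exclude={'tensor', 'bytes_'})
assert 'tensor' not in d and d['frame_rate'] == 44100
```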

from_base64(data, protocol='pickle', compress=None) classmethod

Build Document object from a base64-encoded string

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `str` | a base64 encoded string | *required* |
| `protocol` | `Literal['pickle', 'protobuf']` | protocol to use. It can be 'pickle' or 'protobuf' | `'pickle'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
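
Note that `from_base64` defaults to `protocol='pickle'` while `to_base64` (documented below) defaults to `'protobuf'`, so it is safest to pass the protocol explicitly on both sides. A minimal round-trip sketch:

```python
from docarray.documents import AudioDoc

doc = AudioDoc(frame_rate=16000)
b64 = doc.to_base64(protocol='protobuf')
doc_restored = AudioDoc.from_base64(b64, protocol='protobuf')
assert doc_restored.frame_rate == 16000
```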

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `bytes` | binary bytes | *required* |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
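
A minimal round trip through `to_bytes`/`from_bytes`, optionally compressed (this assumes `'zlib'` is among the compress methods supported by your installation):

```python
from docarray.documents import AudioDoc

doc = AudioDoc(frame_rate=16000)
data = doc.to_bytes(protocol='protobuf', compress='zlib')
doc_restored = AudioDoc.from_bytes(data, protocol='protobuf', compress='zlib')
assert doc_restored.frame_rate == 16000
```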

from_json(data) classmethod

Build Document object from JSON data

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
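
A minimal JSON round trip, using the pydantic-v1-style `json()` serializer documented further down:

```python
from docarray.documents import AudioDoc

doc = AudioDoc(frame_rate=16000)
doc_restored = AudioDoc.from_json(doc.json())
assert doc_restored.frame_rate == doc.frame_rate
```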

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pb_msg` | `DocProto` | the proto message of the Document | *required* |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization: we don't even load the data if the key
            # does not match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
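
This pairs with `to_protobuf()` (documented below) for a protobuf round trip; a minimal sketch:

```python
from docarray.documents import AudioDoc

doc = AudioDoc(frame_rate=16000)
pb_msg = doc.to_protobuf()
doc_restored = AudioDoc.from_protobuf(pb_msg)
assert doc_restored.frame_rate == 16000
```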

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
        )  # here we need to keep the doclist as a doclist, otherwise a user's special json config will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `b` | `StrBytes` | | *required* |
| `content_type` | `str` | | `None` |
| `encoding` | `str` | the encoding to use when parsing a string, defaults to 'utf8' | `'utf8'` |
| `proto` | `Protocol` | protocol to use. | `None` |
| `allow_pickle` | `bool` | allow pickle protocol | `False` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
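
Both helpers print to the console; a minimal sketch of inspecting the class schema and a concrete instance side by side:

```python
from docarray.documents import AudioDoc

AudioDoc.schema_summary()  # prints the fields and types of the class schema

doc = AudioDoc(frame_rate=44100)
doc.summary()  # prints only the non-empty fields of this instance
```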

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `str` | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compression algorithm to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `bytes` | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
| --- | --- |
| `DocProto` | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting the data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating nested Documents and DocLists
  • Updating the Dictionaries of the left Document with those of the right

It behaves like an update operation on Dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable; they behave as regular values, and the value in self is simply replaced by the value in other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `other` | `T` | The Document with which to update the contents of this one | *required* |

Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
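
The dictionary branch above behaves like `dict.update`: keys from `other` overwrite keys in `self`, and keys missing from `other` are kept. A minimal sketch with a hypothetical `Conf` schema:

```python
from typing import Dict

from docarray import BaseDoc


class Conf(BaseDoc):
    params: Dict


a = Conf(params={'lr': 0.1, 'epochs': 5})
b = Conf(params={'lr': 0.01})
a.update(b)
assert a.params == {'lr': 0.01, 'epochs': 5}
```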

ImageDoc

Bases: BaseDoc

Document for handling images.

It can contain:

- an ImageUrl (`ImageDoc.url`)
- an ImageTensor (`ImageDoc.tensor`)
- an AnyEmbedding (`ImageDoc.embedding`)
- an ImageBytes object (`ImageDoc.bytes_`)

You can use this Document directly:

from docarray.documents import ImageDoc

# use it directly
image = ImageDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
)
image.tensor = image.url.load()
# model = MyEmbeddingModel()
# image.embedding = model(image.tensor)

You can extend this Document:

from docarray.documents import ImageDoc
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyImage(ImageDoc):
    second_embedding: Optional[AnyEmbedding] = None


image = MyImage(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
)
image.tensor = image.url.load()
# model = MyEmbeddingModel()
# image.embedding = model(image.tensor)
# image.second_embedding = model(image.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import ImageDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    image: ImageDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    image=ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.image.tensor = mmdoc.image.url.load()

# or
mmdoc.image.bytes_ = mmdoc.image.url.load_bytes()
mmdoc.image.tensor = mmdoc.image.bytes_.load()
Source code in docarray/documents/image.py
class ImageDoc(BaseDoc):
    """
    Document for handling images.

    It can contain:

    - an [`ImageUrl`][docarray.typing.url.ImageUrl] (`ImageDoc.url`)
    - an [`ImageTensor`](../../../api_references/typing/tensor/image) (`ImageDoc.tensor`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`ImageDoc.embedding`)
    - an [`ImageBytes`][docarray.typing.bytes.ImageBytes] object (`ImageDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import ImageDoc

    # use it directly
    image = ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    )
    image.tensor = image.url.load()
    # model = MyEmbeddingModel()
    # image.embedding = model(image.tensor)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import ImageDoc
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyImage(ImageDoc):
        second_embedding: Optional[AnyEmbedding] = None


    image = MyImage(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    )
    image.tensor = image.url.load()
    # model = MyEmbeddingModel()
    # image.embedding = model(image.tensor)
    # image.second_embedding = model(image.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import ImageDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        image: ImageDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        image=ImageDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.image.tensor = mmdoc.image.url.load()

    # or
    mmdoc.image.bytes_ = mmdoc.image.url.load_bytes()
    mmdoc.image.tensor = mmdoc.image.bytes_.load()
    ```
    """

    url: Optional[ImageUrl] = Field(
        description='URL to a (potentially remote) image file that needs to be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true',
        default=None,
    )
    tensor: Optional[ImageTensor] = Field(
        description='Tensor object of the image, which can be one of `ImageNdArray`, `ImageTorchTensor`, or `ImageTensorFlowTensor`.',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the image.',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[ImageBytes] = Field(
        description='Bytes object of the image which is an instance of `ImageBytes`.',
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif (
            isinstance(value, (AbstractTensor, np.ndarray))
            or (torch is not None and isinstance(value, torch.Tensor))
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)
        elif isinstance(value, bytes):
            value = dict(bytes_=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
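
Mirroring the audio case, the validators above also accept a raw tensor and wrap it as `dict(tensor=value)`. A minimal sketch with a hypothetical `Wrapper` schema:

```python
import numpy as np

from docarray import BaseDoc
from docarray.documents import ImageDoc


class Wrapper(BaseDoc):
    image: ImageDoc


# the bare ndarray is coerced into ImageDoc(tensor=...)
doc = Wrapper(image=np.zeros((32, 32, 3), dtype=np.uint8))
assert doc.image.tensor is not None
```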

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data

from_base64(data, protocol='pickle', compress=None) classmethod

Build Document object from a base64-encoded string

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `str` | a base64 encoded string | *required* |
| `protocol` | `Literal['pickle', 'protobuf']` | protocol to use. It can be 'pickle' or 'protobuf' | `'pickle'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `bytes` | binary bytes | *required* |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )

from_json(data) classmethod

Build Document object from JSON data

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pb_msg` | `DocProto` | the proto message of the Document | *required* |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization: we don't even load the data if the key
            # does not match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
        )  # here we need to keep the doclist as a doclist, otherwise a user's special json config will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `b` | `StrBytes` | | *required* |
| `content_type` | `str` | | `None` |
| `encoding` | `str` | the encoding to use when parsing a string, defaults to 'utf8' | `'utf8'` |
| `proto` | `Protocol` | protocol to use. | `None` |
| `allow_pickle` | `bool` | allow pickle protocol | `False` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `str` | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be 'pickle' or 'protobuf' | `'protobuf'` |
| `compress` | `Optional[str]` | compression algorithm to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `bytes` | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
| --- | --- |
| `DocProto` | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting the data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating nested Documents and DocLists
  • Updating the Dictionaries of the left Document with those of the right

It behaves like an update operation on Dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable; they behave as regular values, and the value in self is simply replaced by the value in other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `other` | `T` | The Document with which to update the contents of this one | *required* |

Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

Mesh3D

Bases: BaseDoc

Document for handling meshes for 3D data representation.

A mesh is a representation for 3D data and contains vertices and faces information. Vertices are points in a 3D space, represented as a tensor of shape (n_points, 3). Faces are triangular surfaces that can be defined by three points in 3D space, corresponding to the three vertices of a triangle. Faces can be represented as a tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a vertex in the tensor of vertices.

The Mesh3D Document can contain:

- a Mesh3DUrl (`Mesh3D.url`)
- a VerticesAndFaces object containing an AnyTensor of vertices (`Mesh3D.tensors.vertices`) and an AnyTensor of faces (`Mesh3D.tensors.faces`)
- an AnyEmbedding (`Mesh3D.embedding`)
- a bytes object (`Mesh3D.bytes_`)

You can use this Document directly:

from docarray.documents import Mesh3D

# use it directly
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.tensors.vertices)
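
You can also build the tensors yourself instead of loading them from a URL; a minimal sketch of a single triangle (the `VerticesAndFaces` import path follows the reference in the list above):

```python
import numpy as np

from docarray.documents import Mesh3D
from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces

# one triangular face, referencing the three vertices by index
vertices = np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
faces = np.array([[0, 1, 2]])

mesh = Mesh3D(tensors=VerticesAndFaces(vertices=vertices, faces=faces))
```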

You can extend this Document:

from docarray.documents import Mesh3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyMesh3D(Mesh3D):
    name: Optional[str] = None


mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.name = 'my first mesh'
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.vertices)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import Mesh3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    mesh: Mesh3D
    text: TextDoc


mmdoc = MultiModalDoc(
    mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.mesh.tensors = mmdoc.mesh.url.load()

# or
mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()

You can display your 3D mesh in a notebook from either its url or its tensors:

from docarray.documents import Mesh3D

# display from url
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# mesh.url.display()

# display from tensors
mesh.tensors = mesh.url.load()
# mesh.tensors.display()
Source code in docarray/documents/mesh/mesh_3d.py
class Mesh3D(BaseDoc):
    """
    Document for handling meshes for 3D data representation.

    A mesh is a representation for 3D data and contains vertices and faces information.
    Vertices are points in a 3D space, represented as a tensor of shape (n_points, 3).
    Faces are triangular surfaces that can be defined by three points in 3D space,
    corresponding to the three vertices of a triangle. Faces can be represented as a
    tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a
    vertex in the tensor of vertices.

    The Mesh3D Document can contain:

    - an [`Mesh3DUrl`][docarray.typing.url.Mesh3DUrl] (`Mesh3D.url`)
    - a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces]
    object containing:

        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of
        vertices (`Mesh3D.tensors.vertices`)
        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of faces (`Mesh3D.tensors.faces`)

    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`Mesh3D.embedding`)
    - a `bytes` object (`Mesh3D.bytes_`).

    You can use this Document directly:

    ```python
    from docarray.documents import Mesh3D

    # use it directly
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.tensors.vertices)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import Mesh3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyMesh3D(Mesh3D):
        name: Optional[str] = None


    mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.name = 'my first mesh'
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.vertices)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import Mesh3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        mesh: Mesh3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.mesh.tensors = mmdoc.mesh.url.load()

    # or
    mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()
    ```

    You can display your 3D mesh in a notebook from either its url or its tensors:

    ```python
    from docarray.documents import Mesh3D

    # display from url
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # mesh.url.display()

    # display from tensors
    mesh.tensors = mesh.url.load()
    # mesh.tensors.display()
    ```

    """

    url: Optional[Mesh3DUrl] = Field(
        description='URL to a file containing 3D mesh information. Can be remote (web) URL, or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[VerticesAndFaces] = Field(
        description='A tensor object of 3D mesh of type `VerticesAndFaces`.',
        example=[[0, 1, 1], [1, 0, 1], [1, 1, 0]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the 3D mesh.',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D mesh.',
        default=None,
    )

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            if isinstance(value, str):
                return {'url': value}
            return value

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, Any],
        ) -> T:
            if isinstance(value, str):
                value = cls(url=value)
            return super().validate(value)

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
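
As a quick illustration of `dict()`, here is a small usage sketch; the `MyDoc` class is hypothetical and only serves the example:

```python
from typing import Optional

from docarray import BaseDoc


class MyDoc(BaseDoc):  # hypothetical class, for illustration only
    title: str
    score: Optional[float] = None


doc = MyDoc(title='hello', score=0.5)

# plain dict representation, excluding one field
d = doc.dict(exclude={'score'})
assert d['title'] == 'hello'
assert 'score' not in d
```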

from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | str | a base64 encoded string | required |
| protocol | Literal['pickle', 'protobuf'] | protocol to use. It can be 'pickle' or 'protobuf' | 'pickle' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
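
Note that `to_base64` defaults to `'protobuf'` while `from_base64` defaults to `'pickle'`, so a round trip should pass the protocol explicitly. A minimal sketch, using a hypothetical `NoteDoc` class:

```python
from docarray import BaseDoc


class NoteDoc(BaseDoc):  # hypothetical class, for illustration only
    text: str


doc = NoteDoc(text='hello world')

# pass the protocol explicitly: the two defaults differ
s = doc.to_base64(protocol='protobuf')
restored = NoteDoc.from_base64(s, protocol='protobuf')
assert restored.text == 'hello world'
```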

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | bytes | binary bytes | required |
| protocol | ProtocolType | protocol to use. It can be 'pickle' or 'protobuf' | 'protobuf' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
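
The same round trip works on raw bytes, optionally compressed. A sketch assuming `'gzip'` is among the supported compression algorithms (again with a hypothetical `NoteDoc`):

```python
from docarray import BaseDoc


class NoteDoc(BaseDoc):  # hypothetical class, for illustration only
    text: str


doc = NoteDoc(text='hello world')

# protocol and compression must match on both sides
data = doc.to_bytes(protocol='protobuf', compress='gzip')
restored = NoteDoc.from_bytes(data, protocol='protobuf', compress='gzip')
assert restored.text == 'hello world'
```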

from_json(data) classmethod

Build a Document object from JSON data.

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
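
A minimal JSON round trip with a hypothetical `NoteDoc`:

```python
from docarray import BaseDoc


class NoteDoc(BaseDoc):  # hypothetical class, for illustration only
    text: str


doc = NoteDoc(text='hello world')
restored = NoteDoc.from_json(doc.json())
assert restored.text == 'hello world'
```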

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pb_msg | DocProto | the proto message of the Document | required |

Returns:

| Type | Description |
| --- | --- |
| T | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
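
Together with `to_protobuf`, this gives a protobuf round trip. A sketch assuming the protobuf dependency is installed (hypothetical `NoteDoc` again):

```python
from docarray import BaseDoc


class NoteDoc(BaseDoc):  # hypothetical class, for illustration only
    text: str


doc = NoteDoc(text='hello world')
pb_msg = doc.to_protobuf()  # a DocProto message
restored = NoteDoc.from_protobuf(pb_msg)
assert restored.text == 'hello world'
```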

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| b | StrBytes |  | required |
| content_type | str |  | None |
| encoding | str | the encoding to use when parsing a string, defaults to 'utf8' | 'utf8' |
| proto | Protocol | protocol to use. | None |
| allow_pickle | bool | allow pickle protocol | False |

Returns:

| Type | Description |
| --- | --- |
| T | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
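
Both helpers are handy for quick inspection, for example:

```python
from docarray.documents import Mesh3D

# print the static schema of the class
Mesh3D.schema_summary()

# print the non-empty fields of a concrete instance
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.summary()
```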

to_base64(protocol='protobuf', compress=None)

Serialize a Document object as a base64 string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| protocol | ProtocolType | protocol to use. It can be 'pickle' or 'protobuf' | 'protobuf' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| str | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| protocol | ProtocolType | protocol to use. It can be 'pickle' or 'protobuf' | 'protobuf' |
| compress | Optional[str] | compression algorithm to use | None |

Returns:

| Type | Description |
| --- | --- |
| bytes | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
| --- | --- |
| DocProto | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating dictionaries of the left Document with those of the right

It behaves like an update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Tuples are not merged, since they are meant to be immutable: they behave as regular types, and the value of self is replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| other | T | the Document with which to update the contents of this one | required |
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

PointCloud3D

Bases: BaseDoc

Document for handling point clouds for 3D data representation.

A point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly sampling points on the surface of the 3D body. Compared to the mesh representation, a point cloud is a fixed-size ndarray of shape (n_samples, 3) and hence easier for deep learning algorithms to handle.

A PointCloud3D Document can contain:

  • a PointCloud3DUrl (PointCloud3D.url)
  • a PointsAndColors object (PointCloud3D.tensors)
  • an AnyEmbedding (PointCloud3D.embedding)
  • a bytes object (PointCloud3D.bytes_)

You can use this Document directly:

from docarray.documents import PointCloud3D

# use it directly
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)

You can extend this Document:

from docarray.documents import PointCloud3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyPointCloud3D(PointCloud3D):
    second_embedding: Optional[AnyEmbedding] = None


pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)
# pc.second_embedding = model(pc.tensors.colors)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import PointCloud3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    point_cloud: PointCloud3D
    text: TextDoc


mmdoc = MultiModalDoc(
    point_cloud=PointCloud3D(
        url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

# or
mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()

You can display your point cloud from either its url, or its tensors:

from docarray.documents import PointCloud3D

# display from url
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# pc.url.display()

# display from tensors
pc.tensors = pc.url.load(samples=10000)
# pc.tensors.display()
Source code in docarray/documents/point_cloud/point_cloud_3d.py
class PointCloud3D(BaseDoc):
    """
    Document for handling point clouds for 3D data representation.

    Point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly
    sampling points within the surface of the 3D body. Compared to the mesh
    representation, the point cloud is a fixed size ndarray of shape `(n_samples, 3)` and
    hence easier for deep learning algorithms to handle.

    A PointCloud3D Document can contain:

    - a [`PointCloud3DUrl`][docarray.typing.url.PointCloud3DUrl] (`PointCloud3D.url`)
    - a [`PointsAndColors`][docarray.documents.point_cloud.points_and_colors.PointsAndColors] object (`PointCloud3D.tensors`)
    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`PointCloud3D.embedding`)
    - a `bytes` object (`PointCloud3D.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import PointCloud3D

    # use it directly
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import PointCloud3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyPointCloud3D(PointCloud3D):
        second_embedding: Optional[AnyEmbedding] = None


    pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    # pc.second_embedding = model(pc.tensors.colors)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import PointCloud3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        point_cloud: PointCloud3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        point_cloud=PointCloud3D(
            url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

    # or
    mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()
    ```

    You can display your point cloud from either its url, or its tensors:

    ```python
    from docarray.documents import PointCloud3D

    # display from url
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # pc.url.display()

    # display from tensors
    pc.tensors = pc.url.load(samples=10000)
    # pc.tensors.display()
    ```
    """

    url: Optional[PointCloud3DUrl] = Field(
        description='URL to a file containing point cloud information. Can be a remote (web) URL or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[PointsAndColors] = Field(
        description='A tensor object of the 3D point cloud of type `PointsAndColors`.',
        example=[[0, 0, 1], [1, 0, 1], [0, 1, 1]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the 3D point cloud.',
        example=[1, 1, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D point cloud.',
        default=None,
    )

    @classmethod
    def _validate(cls, value: Union[str, AbstractTensor, Any]) -> Any:
        if isinstance(value, str):
            value = {'url': value}
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = {'tensors': PointsAndColors(points=value)}

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
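
The validators above mean that a plain string is coerced into the `url` field and a raw tensor is wrapped into `PointsAndColors`, wherever a `PointCloud3D` is expected. A sketch of this coercion, using a hypothetical `SceneDoc` class:

```python
import numpy as np

from docarray import BaseDoc
from docarray.documents import PointCloud3D


class SceneDoc(BaseDoc):  # hypothetical class, for illustration only
    point_cloud: PointCloud3D


# a plain string becomes PointCloud3D(url=...)
scene = SceneDoc(
    point_cloud='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
)
assert scene.point_cloud.url is not None

# a raw tensor becomes PointsAndColors(points=...)
scene = SceneDoc(point_cloud=np.zeros((100, 3)))
assert scene.point_cloud.tensors.points.shape == (100, 3)
```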

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data

from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | str | a base64 encoded string | required |
| protocol | Literal['pickle', 'protobuf'] | protocol to use. It can be 'pickle' or 'protobuf' | 'pickle' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | bytes | binary bytes | required |
| protocol | ProtocolType | protocol to use. It can be 'pickle' or 'protobuf' | 'protobuf' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )

from_json(data) classmethod

Build a Document object from JSON data.

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pb_msg | DocProto | the proto message of the Document | required |

Returns:

| Type | Description |
| --- | --- |
| T | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| b | StrBytes |  | required |
| content_type | str |  | None |
| encoding | str | the encoding to use when parsing a string, defaults to 'utf8' | 'utf8' |
| proto | Protocol | protocol to use. | None |
| allow_pickle | bool | allow pickle protocol | False |

Returns:

| Type | Description |
| --- | --- |
| T | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()

to_base64(protocol='protobuf', compress=None)

Serialize a Document object as a base64 string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| protocol | ProtocolType | protocol to use. It can be 'pickle' or 'protobuf' | 'protobuf' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| str | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| protocol | ProtocolType | protocol to use. It can be 'pickle' or 'protobuf' | 'protobuf' |
| compress | Optional[str] | compression algorithm to use | None |

Returns:

| Type | Description |
| --- | --- |
| bytes | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
| --- | --- |
| DocProto | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating dictionaries of the left Document with those of the right

It behaves like an update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Tuples are not merged, since they are meant to be immutable: they behave as regular types, and the value of self is replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| other | T | the Document with which to update the contents of this one | required |
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

PointsAndColors

Bases: BaseDoc

Document for handling the tensor data of a PointCloud3D object.

A PointsAndColors Document can contain:

  • an AnyTensor containing the coordinates of the points in 3D space (PointsAndColors.points)
  • an AnyTensor containing the points' color information (PointsAndColors.colors)
Source code in docarray/documents/point_cloud/points_and_colors.py
class PointsAndColors(BaseDoc):
    """
    Document for handling the tensor data of a [`PointCloud3D`][docarray.documents.point_cloud.PointCloud3D] object.

    A PointsAndColors Document can contain:

    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points in 3D space information (`PointsAndColors.points`)
    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points' color information (`PointsAndColors.colors`)
    """

    points: AnyTensor
    colors: Optional[AnyTensor] = None

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[str, AbstractTensor, Any],
    ) -> T:
        if isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = cls(points=value)

        return super().validate(value)

    def display(self) -> None:
        """
        Plot point cloud consisting of points in 3D space and optionally colors.
        """
        if TYPE_CHECKING:
            import trimesh
        else:
            trimesh = import_library('trimesh', raise_error=True)
        from IPython.display import display

        colors = (
            self.colors
            if self.colors is not None
            else np.tile(
                np.array([0, 0, 0]),
                (self.points.get_comp_backend().shape(self.points)[0], 1),
            )
        )
        pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

        s = trimesh.Scene(geometry=pc)
        display(s.show())
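
A small sketch that builds a `PointsAndColors` object from random data (made up for illustration); `display()` additionally requires `trimesh` and a notebook environment:

```python
import numpy as np

from docarray.documents.point_cloud.points_and_colors import PointsAndColors

# 500 random points, one RGB color per point
points = np.random.random((500, 3))
colors = np.random.randint(0, 255, size=(500, 3))

pac = PointsAndColors(points=points, colors=colors)
# pac.display()
```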

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data

display()

Plot point cloud consisting of points in 3D space and optionally colors.

Source code in docarray/documents/point_cloud/points_and_colors.py
def display(self) -> None:
    """
    Plot point cloud consisting of points in 3D space and optionally colors.
    """
    if TYPE_CHECKING:
        import trimesh
    else:
        trimesh = import_library('trimesh', raise_error=True)
    from IPython.display import display

    colors = (
        self.colors
        if self.colors is not None
        else np.tile(
            np.array([0, 0, 0]),
            (self.points.get_comp_backend().shape(self.points)[0], 1),
        )
    )
    pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

    s = trimesh.Scene(geometry=pc)
    display(s.show())

from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | str | a base64 encoded string | required |
| protocol | Literal['pickle', 'protobuf'] | protocol to use. It can be 'pickle' or 'protobuf' | 'pickle' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| data | bytes | binary bytes | required |
| protocol | ProtocolType | protocol to use. It can be 'pickle' or 'protobuf' | 'protobuf' |
| compress | Optional[str] | compress method to use | None |

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )

from_json(data) classmethod

Build a Document object from JSON data.

Returns:

| Type | Description |
| --- | --- |
| T | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| pb_msg | DocProto | the proto message of the Document | required |

Returns:

| Type | Description |
| --- | --- |
| T | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # we need to keep the doclist as a doclist, otherwise a user's special json config will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

  • b (StrBytes): the raw string or bytes to parse. Required.
  • content_type (str): Default: None.
  • encoding (str): the encoding to use when parsing a string. Default: 'utf8'.
  • proto (Protocol): protocol to use. Default: None.
  • allow_pickle (bool): allow the pickle protocol. Default: False.

Returns:

  • T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
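
A minimal sketch with a hypothetical MyDoc class, assuming a pydantic v1 environment where parse_raw parses JSON by default; from_json is the more general entry point:

from docarray import BaseDoc


# hypothetical example class
class MyDoc(BaseDoc):
    title: str


doc = MyDoc(title='hello')
# with no content_type given, pydantic v1 parses the payload as JSON
restored = MyDoc.parse_raw(doc.json())
assert restored.title == doc.title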

schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
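
Both helpers are purely for inspection; a short usage sketch:

from docarray.documents import TextDoc

TextDoc.schema_summary()  # prints the schema of the TextDoc class
TextDoc(text='hello').summary()  # prints the non-empty fields of this instance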

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression method to use. Default: None.

Returns:

  • str: a base64-encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
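
Note that to_base64 defaults to the protobuf protocol while from_base64 (documented further below) defaults to pickle, so it is safest to pass the protocol explicitly on both sides. A minimal sketch with a hypothetical MyDoc class:

from docarray import BaseDoc


# hypothetical example class
class MyDoc(BaseDoc):
    title: str


doc = MyDoc(title='hello')
b64 = doc.to_base64(protocol='protobuf')
# from_base64 defaults to protocol='pickle', so state the protocol explicitly
restored = MyDoc.from_base64(b64, protocol='protobuf')
assert restored.title == doc.title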

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression algorithm to use. Default: None.

Returns:

  • bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

  • DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self in place with the content of other. Updating one Document with another consists of the following:

  • Setting the data properties of the second Document on the first, if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating the dictionaries of the left with those of the right

It behaves like the update operation on dictionaries, except that, because it applies to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave like regular types, and the value in self is simply replaced by the value in other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

  • other (T): the Document with which to update the contents of this one. Required.
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
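
Beyond the list example above, the same rules apply to dict fields (the right side wins per key) and set fields (union). A small sketch with a hypothetical Meta class:

from typing import Dict

from docarray import BaseDoc


# hypothetical example class
class Meta(BaseDoc):
    tags: Dict[str, str]


m1 = Meta(tags={'a': '1', 'b': '2'})
m2 = Meta(tags={'b': '3'})
m1.update(m2)
assert m1.tags == {'a': '1', 'b': '3'}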

TextDoc

Bases: BaseDoc

Document for handling text.

It can contain:

  • a TextUrl (TextDoc.url)
  • a str (TextDoc.text)
  • an AnyEmbedding (TextDoc.embedding)
  • a bytes object (TextDoc.bytes_)

You can use this Document directly:

from docarray.documents import TextDoc

# use it directly
txt_doc = TextDoc(url='https://www.gutenberg.org/files/1065/1065-0.txt')
txt_doc.text = txt_doc.url.load()
# model = MyEmbeddingModel()
# txt_doc.embedding = model(txt_doc.text)

You can initialize directly from a string:

from docarray.documents import TextDoc

txt_doc = TextDoc('hello world')

You can extend this Document:

from docarray.documents import TextDoc
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyText(TextDoc):
    second_embedding: Optional[AnyEmbedding] = None


txt_doc = MyText(url='https://www.gutenberg.org/files/1065/1065-0.txt')
txt_doc.text = txt_doc.url.load()
# model = MyEmbeddingModel()
# txt_doc.embedding = model(txt_doc.text)
# txt_doc.second_embedding = model(txt_doc.text)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import ImageDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    image_doc: ImageDoc
    text_doc: TextDoc


mmdoc = MultiModalDoc(
    image_doc=ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    ),
    text_doc=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.image_doc.tensor = mmdoc.image_doc.url.load()

# or
mmdoc.image_doc.bytes_ = mmdoc.image_doc.url.load_bytes()
mmdoc.image_doc.tensor = mmdoc.image_doc.bytes_.load()

This Document can be compared against another Document of the same type or against a string. When compared against another object of the same type, the pydantic BaseModel equality check applies, which checks the equality of every attribute except id. When compared against a str, it checks the equality of the text attribute against the given string.

from docarray.documents import TextDoc

doc = TextDoc(text='This is the main text', url='exampleurl.com/file')
doc2 = TextDoc(text='This is the main text', url='exampleurl.com/file')

doc == 'This is the main text'  # True
doc == doc2  # True
Source code in docarray/documents/text.py
class TextDoc(BaseDoc):
    """
    Document for handling text.

    It can contain:

    - a [`TextUrl`][docarray.typing.url.TextUrl] (`TextDoc.url`)
    - a `str` (`TextDoc.text`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`TextDoc.embedding`)
    - a `bytes` object (`TextDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import TextDoc

    # use it directly
    txt_doc = TextDoc(url='https://www.gutenberg.org/files/1065/1065-0.txt')
    txt_doc.text = txt_doc.url.load()
    # model = MyEmbeddingModel()
    # txt_doc.embedding = model(txt_doc.text)
    ```

    You can initialize directly from a string:

    ```python
    from docarray.documents import TextDoc

    txt_doc = TextDoc('hello world')
    ```

    You can extend this Document:

    ```python
    from docarray.documents import TextDoc
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyText(TextDoc):
        second_embedding: Optional[AnyEmbedding] = None


    txt_doc = MyText(url='https://www.gutenberg.org/files/1065/1065-0.txt')
    txt_doc.text = txt_doc.url.load()
    # model = MyEmbeddingModel()
    # txt_doc.embedding = model(txt_doc.text)
    # txt_doc.second_embedding = model(txt_doc.text)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import ImageDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        image_doc: ImageDoc
        text_doc: TextDoc


    mmdoc = MultiModalDoc(
        image_doc=ImageDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
        ),
        text_doc=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.image_doc.tensor = mmdoc.image_doc.url.load()

    # or
    mmdoc.image_doc.bytes_ = mmdoc.image_doc.url.load_bytes()
    mmdoc.image_doc.tensor = mmdoc.image_doc.bytes_.load()
    ```

    This Document can be compared against another Document of the same type or a string.
    When compared against another object of the same type, the pydantic BaseModel
    equality check will apply which checks the equality of every attribute,
    excluding `id`. When compared against a str, it will check the equality
    of the `text` attribute against the given string.

    ```python
    from docarray.documents import TextDoc

    doc = TextDoc(text='This is the main text', url='exampleurl.com/file')
    doc2 = TextDoc(text='This is the main text', url='exampleurl.com/file')

    doc == 'This is the main text'  # True
    doc == doc2  # True
    ```

    """

    text: Optional[str] = Field(
        description='The text content stored in the document',
        example='This is an example text content of the document',
        default=None,
    )
    url: Optional[TextUrl] = Field(
        description='URL to a (potentially remote) text file that can be loaded',
        example='https://www.w3.org/History/19921103-hypertext/hypertext/README.html',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the text',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of the text',
        default=None,
    )

    def __init__(self, text: Optional[str] = None, **kwargs):
        if 'text' not in kwargs:
            kwargs['text'] = text
        super().__init__(**kwargs)

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, values):
            if isinstance(values, str):
                return {'text': values}
            else:
                return values

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, Any],
        ) -> T:
            if isinstance(value, str):
                value = cls(text=value)
            return super().validate(value)

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            return self.text == other
        else:
            # BaseModel has a default equality
            return super().__eq__(other)

    def __contains__(self, item: str) -> bool:
        """
        This method makes `TextDoc` behave the same as a `str`.

        :param item: a string to be checked as a substring of the `text` attribute
        :return: A boolean determining the presence of `item` as a substring in `text`

        ```python
        from docarray.documents import TextDoc

        t = TextDoc(text='this is my text document')
        assert 'text' in t
        assert 'docarray' not in t
        ```
        """
        if self.text is not None:
            return self.text.__contains__(item)
        else:
            return False

    def _get_string_for_regex_filter(self):
        return self.text

__contains__(item)

This method makes TextDoc behave the same as a str.

Parameters:

  • item (str): a string to be checked as a substring of the text attribute. Required.

Returns:

  • bool: a boolean determining the presence of item as a substring in text

from docarray.documents import TextDoc

t = TextDoc(text='this is my text document')
assert 'text' in t
assert 'docarray' not in t

Source code in docarray/documents/text.py
def __contains__(self, item: str) -> bool:
    """
    This method makes `TextDoc` behave the same as a `str`.

    :param item: a string to be checked as a substring of the `text` attribute
    :return: A boolean determining the presence of `item` as a substring in `text`

    ```python
    from docarray.documents import TextDoc

    t = TextDoc(text='this is my text document')
    assert 'text' in t
    assert 'docarray' not in t
    ```
    """
    if self.text is not None:
        return self.text.__contains__(item)
    else:
        return False

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
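
For example, as a sketch of the pydantic v1 style call (under pydantic v2, model_dump() is the equivalent):

from docarray.documents import TextDoc

doc = TextDoc(text='hello')
d = doc.dict(exclude_none=True)  # a plain dict, with None-valued fields dropped
assert d['text'] == 'hello'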

from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64 encoded string

Parameters:

  • data (str): a base64 encoded string. Required.
  • protocol (Literal['pickle', 'protobuf']): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'pickle'.
  • compress (Optional[str]): compression method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes

Parameters:

  • data (bytes): binary bytes. Required.
  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )

from_json(data) classmethod

Build a Document object from JSON data

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

  • pb_msg (DocProto): the proto message of the Document. Required.

Returns:

  • T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # we need to keep the doclist as a doclist, otherwise a user's special json config will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

  • b (StrBytes): the raw string or bytes to parse. Required.
  • content_type (str): Default: None.
  • encoding (str): the encoding to use when parsing a string. Default: 'utf8'.
  • proto (Protocol): protocol to use. Default: None.
  • allow_pickle (bool): allow the pickle protocol. Default: False.

Returns:

  • T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression method to use. Default: None.

Returns:

  • str: a base64-encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression algorithm to use. Default: None.

Returns:

  • bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

  • DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)

update(other)

Updates self in place with the content of other. Updating one Document with another consists of the following:

  • Setting the data properties of the second Document on the first, if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating the dictionaries of the left with those of the right

It behaves like the update operation on dictionaries, except that, because it applies to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave like regular types, and the value in self is simply replaced by the value in other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

  • other (T): the Document with which to update the contents of this one. Required.
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

VerticesAndFaces

Bases: BaseDoc

Document for handling the tensor data of a Mesh3D object.

A VerticesAndFaces Document can contain:

  • an AnyTensor containing the vertices information (VerticesAndFaces.vertices)
  • an AnyTensor containing the faces information (VerticesAndFaces.faces)
Source code in docarray/documents/mesh/vertices_and_faces.py
class VerticesAndFaces(BaseDoc):
    """
    Document for handling the tensor data of a [`Mesh3D`][docarray.documents.mesh.Mesh3D] object.

    A VerticesAndFaces Document can contain:

    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the vertices information (`VerticesAndFaces.vertices`)
    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the faces information (`VerticesAndFaces.faces`)
    """

    vertices: AnyTensor
    faces: AnyTensor

    @classmethod
    def _docarray_validate(
        cls: Type[T],
        value: Union[str, Any],
    ) -> T:
        return super().validate(value)

    def display(self) -> None:
        """
        Plot mesh consisting of vertices and faces.
        """
        if TYPE_CHECKING:
            import trimesh
        else:
            trimesh = import_library('trimesh', raise_error=True)

        from IPython.display import display

        if self.vertices is None or self.faces is None:
            raise ValueError(
                'Can\'t display mesh from tensors when the vertices and/or faces '
                'are None.'
            )

        mesh = trimesh.Trimesh(vertices=self.vertices, faces=self.faces)
        display(mesh.show())
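
A minimal construction sketch, assuming numpy is available; the toy tensors below describe a single triangle:

import numpy as np

from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces

# a single triangle: three vertices and one face indexing into them
tensors = VerticesAndFaces(
    vertices=np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
    faces=np.array([[0, 1, 2]]),
)
# tensors.display()  # requires trimesh and an IPython environment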

dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data

display()

Plot mesh consisting of vertices and faces.

Source code in docarray/documents/mesh/vertices_and_faces.py
def display(self) -> None:
    """
    Plot mesh consisting of vertices and faces.
    """
    if TYPE_CHECKING:
        import trimesh
    else:
        trimesh = import_library('trimesh', raise_error=True)

    from IPython.display import display

    if self.vertices is None or self.faces is None:
        raise ValueError(
            'Can\'t display mesh from tensors when the vertices and/or faces '
            'are None.'
        )

    mesh = trimesh.Trimesh(vertices=self.vertices, faces=self.faces)
    display(mesh.show())

from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64 encoded string

Parameters:

  • data (str): a base64 encoded string. Required.
  • protocol (Literal['pickle', 'protobuf']): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'pickle'.
  • compress (Optional[str]): compression method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)

from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes

Parameters:

  • data (bytes): binary bytes. Required.
  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )

from_json(data) classmethod

Build a Document object from JSON data

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)

from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

  • pb_msg (DocProto): the proto message of the Document. Required.

Returns:

  • T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)

json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # we need to keep the doclist as a doclist, otherwise a user's special json config will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)

parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

  • b (StrBytes): the raw string or bytes to parse. Required.
  • content_type (str): Default: None.
  • encoding (str): the encoding to use when parsing a string. Default: 'utf8'.
  • proto (Protocol): protocol to use. Default: None.
  • allow_pickle (bool): allow the pickle protocol. Default: False.

Returns:

  • T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )

schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)

summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()

to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression method to use. Default: None.

Returns:

  • str: a base64-encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')

to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression algorithm to use. Default: None.

Returns:

  • bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)

to_protobuf()

Convert Document into a Protobuf message.

Returns:

  • DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
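
Paired with from_protobuf, this gives a protobuf round trip; a minimal sketch with a hypothetical MyDoc schema:

```python
from docarray import BaseDoc


class MyDoc(BaseDoc):  # hypothetical example schema
    text: str


doc = MyDoc(text='hello')

pb_msg = doc.to_protobuf()          # a DocProto message
doc2 = MyDoc.from_protobuf(pb_msg)  # rebuild the document from the message
assert doc2.text == 'hello'
```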

update(other)

Updates self with the content of other. Changes are applied in place. Updating one Document with another consists of the following:

  • Setting the data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating nested Documents and DocLists
  • Updating the dictionaries of the left Document with those of the right

It behaves like a dictionary update, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Note that tuples are not merged, since they are meant to be immutable: they behave as regular types, and the value of self is replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| other | T | The Document with which to update the contents of this one | required |
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves like a dictionary update, except that, since it is applied
    to a static schema type, a field counts as present when its value is
    not None, and DocLists, lists and sets are concatenated. Note that
    tuples are not merged, since they are meant to be immutable: they
    behave as regular types, and the value of `self` is replaced by the
    value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this one
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
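
The example above covers simple fields and lists; the sketch below (hypothetical schemas) shows the nested-Document and dictionary rules in action:

```python
from typing import Dict, Optional

from docarray import BaseDoc


class InnerDoc(BaseDoc):  # hypothetical nested schema
    text: Optional[str] = None
    score: Optional[float] = None


class OuterDoc(BaseDoc):  # hypothetical outer schema
    inner: InnerDoc
    meta: Dict[str, str]


a = OuterDoc(inner=InnerDoc(text='left', score=0.1), meta={'k1': 'v1'})
b = OuterDoc(inner=InnerDoc(score=0.9), meta={'k2': 'v2'})

a.update(b)
assert a.inner.text == 'left'  # None on the right, so the left value survives
assert a.inner.score == 0.9  # non-None right value wins
assert a.meta == {'k1': 'v1', 'k2': 'v2'}  # left dict updated with the right
```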

VideoDoc

Bases: BaseDoc

Document for handling video.

The Video Document can contain:

  • a VideoUrl (VideoDoc.url)
  • an AudioDoc (VideoDoc.audio)
  • a VideoTensor (VideoDoc.tensor)
  • an AnyTensor representing the indices of the video's key frames (VideoDoc.key_frame_indices)
  • an AnyEmbedding (VideoDoc.embedding)
  • a VideoBytes object (VideoDoc.bytes_)

You can use this Document directly:

from docarray.documents import VideoDoc, AudioDoc

# use it directly
vid = VideoDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
)
tensor, audio_tensor, key_frame_indices = vid.url.load()
vid.tensor = tensor
vid.audio = AudioDoc(tensor=audio_tensor)
vid.key_frame_indices = key_frame_indices
# model = MyEmbeddingModel()
# vid.embedding = model(vid.tensor)

You can extend this Document:

from typing import Optional

from docarray.documents import TextDoc, VideoDoc


# extend it
class MyVideo(VideoDoc):
    name: Optional[TextDoc] = None


video = MyVideo(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
)
video.name = TextDoc(text='my first video')
video.tensor = video.url.load().video
# model = MyEmbeddingModel()
# video.embedding = model(video.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import TextDoc, VideoDoc


# compose it
class MultiModalDoc(BaseDoc):
    video: VideoDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    video=VideoDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.video.tensor = mmdoc.video.url.load().video

# or
mmdoc.video.bytes_ = mmdoc.video.url.load_bytes()
mmdoc.video.tensor = mmdoc.video.bytes_.load().video
Source code in docarray/documents/video.py
class VideoDoc(BaseDoc):
    """
    Document for handling video.

    The Video Document can contain:

    - a [`VideoUrl`][docarray.typing.url.VideoUrl] (`VideoDoc.url`)
    - an [`AudioDoc`][docarray.documents.AudioDoc] (`VideoDoc.audio`)
    - a [`VideoTensor`](../../../api_references/typing/tensor/video) (`VideoDoc.tensor`)
    - an [`AnyTensor`](../../../api_references/typing/tensor/tensor) representing the indices of the video's key frames (`VideoDoc.key_frame_indices`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`VideoDoc.embedding`)
    - a [`VideoBytes`][docarray.typing.bytes.VideoBytes] object (`VideoDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import VideoDoc, AudioDoc

    # use it directly
    vid = VideoDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
    )
    tensor, audio_tensor, key_frame_indices = vid.url.load()
    vid.tensor = tensor
    vid.audio = AudioDoc(tensor=audio_tensor)
    vid.key_frame_indices = key_frame_indices
    # model = MyEmbeddingModel()
    # vid.embedding = model(vid.tensor)
    ```

    You can extend this Document:

    ```python
    from typing import Optional

    from docarray.documents import TextDoc, VideoDoc


    # extend it
    class MyVideo(VideoDoc):
        name: Optional[TextDoc] = None


    video = MyVideo(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
    )
    video.name = TextDoc(text='my first video')
    video.tensor = video.url.load().video
    # model = MyEmbeddingModel()
    # video.embedding = model(video.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import TextDoc, VideoDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        video: VideoDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        video=VideoDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.video.tensor = mmdoc.video.url.load().video

    # or
    mmdoc.video.bytes_ = mmdoc.video.url.load_bytes()
    mmdoc.video.tensor = mmdoc.video.bytes_.load().video
    ```
    """

    url: Optional[VideoUrl] = Field(
        description='URL to a (potentially remote) video file that needs to be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true',
        default=None,
    )
    audio: Optional[AudioDoc] = Field(
        description='Audio document associated with the video',
        default=None,
    )
    tensor: Optional[VideoTensor] = Field(
        description='Tensor object representing the video, which can be one of `VideoNdArray`, `VideoTorchTensor`, `VideoTensorFlowTensor`',
        default=None,
    )
    key_frame_indices: Optional[AnyTensor] = Field(
        description='Indices of the key frames in the video',
        example=[0, 1, 2, 3, 4],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the video',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[VideoBytes] = Field(
        description='Bytes representation of the video',
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
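
Thanks to the _validate hook above, a plain URL string (or a tensor) can be passed wherever a VideoDoc is expected and is coerced into a document; a minimal sketch:

```python
from docarray import BaseDoc
from docarray.documents import VideoDoc


class MultiModalDoc(BaseDoc):
    video: VideoDoc


# the string is turned into VideoDoc(url=...) by the validator shown above
mmdoc = MultiModalDoc(
    video='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
)
assert mmdoc.video.url is not None
```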

VideoDoc inherits the full BaseDoc API (dict(), json(), from_base64(), from_bytes(), from_json(), from_protobuf(), parse_raw(), schema_summary(), summary(), to_base64(), to_bytes(), to_protobuf(), update()). These methods are documented above and behave identically here.

audio

AudioDoc

Bases: BaseDoc

Document for handling audios.

The Audio Document can contain:

  • an AudioUrl (AudioDoc.url)
  • an AudioTensor (AudioDoc.tensor)
  • an AnyEmbedding (AudioDoc.embedding)
  • an AudioBytes object (AudioDoc.bytes_)
  • an integer representing the frame_rate (AudioDoc.frame_rate)

You can use this Document directly:

from docarray.documents import AudioDoc

# use it directly
audio = AudioDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
audio.tensor, audio.frame_rate = audio.url.load()
# model = MyEmbeddingModel()
# audio.embedding = model(audio.tensor)

You can extend this Document:

from docarray.documents import AudioDoc, TextDoc
from typing import Optional


# extend it
class MyAudio(AudioDoc):
    name: Optional[TextDoc] = None


audio = MyAudio(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
)
audio.name = TextDoc(text='my first audio')
audio.tensor, audio.frame_rate = audio.url.load()
# model = MyEmbeddingModel()
# audio.embedding = model(audio.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import AudioDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    audio: AudioDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    audio=AudioDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load()

# equivalent to
mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes()
mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load()
Source code in docarray/documents/audio.py
class AudioDoc(BaseDoc):
    """
    Document for handling audios.

    The Audio Document can contain:

    - an [`AudioUrl`][docarray.typing.url.AudioUrl] (`AudioDoc.url`)
    - an [`AudioTensor`](../../../api_references/typing/tensor/audio) (`AudioDoc.tensor`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`AudioDoc.embedding`)
    - an [`AudioBytes`][docarray.typing.bytes.AudioBytes] (`AudioDoc.bytes_`) object
    - an integer representing the frame_rate (`AudioDoc.frame_rate`)

    You can use this Document directly:

    ```python
    from docarray.documents import AudioDoc

    # use it directly
    audio = AudioDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import AudioDoc, TextDoc
    from typing import Optional


    # extend it
    class MyAudio(AudioDoc):
        name: Optional[TextDoc] = None


    audio = MyAudio(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.name = TextDoc(text='my first audio')
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import AudioDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        audio: AudioDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        audio=AudioDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load()

    # equivalent to
    mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes()
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load()
    ```
    """

    url: Optional[AudioUrl] = Field(
        description='URL to a (potentially remote) audio file that can be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.mp3?raw=true',
        default=None,
    )
    tensor: Optional[AudioTensor] = Field(
        description='Tensor object of the audio, which can be one of `AudioNdArray`, `AudioTorchTensor`, `AudioTensorFlowTensor`',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the audio.',
        example=[0, 1, 0],
        default=None,
    )
    bytes_: Optional[AudioBytes] = Field(
        description='Bytes representation of the audio',
        default=None,
    )
    frame_rate: Optional[int] = Field(
        description='An integer representing the frame rate of the audio.',
        example=24,
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
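
AudioDoc defines the same _validate coercion, so a URL string or a raw tensor can stand in for the document; a minimal sketch using a NumPy array:

```python
import numpy as np

from docarray import BaseDoc
from docarray.documents import AudioDoc


class MultiModalDoc(BaseDoc):
    audio: AudioDoc


# the ndarray is turned into AudioDoc(tensor=...) by the validator shown above
mmdoc = MultiModalDoc(audio=np.zeros(1000, dtype=np.float32))
assert mmdoc.audio.tensor is not None
```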
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
from_base64(data, protocol='pickle', compress=None) classmethod

Build Document object from binary bytes

Parameters:

Name Type Description Default
data str

a base64 encoded string

required
protocol Literal['pickle', 'protobuf']

protocol to use. It can be 'pickle' or 'protobuf'

'pickle'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

Name Type Description Default
data bytes

binary bytes

required
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build Document object from json data

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

create a Document from a protobuf message

Parameters:

Name Type Description Default
pb_msg DocProto

the proto message of the Document

required

Returns:

Type Description
T

a Document initialize with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

Name Type Description Default
b StrBytes
required
content_type str
None
encoding str

the encoding to use when parsing a string, defaults to 'utf8'

'utf8'
proto Protocol

protocol to use.

None
allow_pickle bool

allow pickle protocol

False

Returns:

Type Description
T

a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into as base64 string

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
str

a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compression algorithm to use

None

Returns:

Type Description
bytes

the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

Type Description
DocProto

the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists in the following:

  • Setting data properties of the second Document to the first Document if they are not None
  • Concatenating lists and updating sets
  • Updating recursively Documents and DocLists
  • Updating Dictionaries of the left with the right

It behaves as an update operation for Dictionaries, except that since it is applied to a static schema type, the presence of the field is given by the field not having a None value and that DocLists, lists and sets are concatenated. It is worth mentioning that Tuples are not merged together since they are meant to be immutable, so they behave as regular types and the value of self is updated with the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

Name Type Description Default
other T

The Document with which to update the contents of this

required
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
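
To make the merge semantics concrete, here is a minimal sketch (the `Cfg` document and its fields are made up for illustration): dict fields are merged key by key, while tuple fields behave like simple values and are replaced outright.

```python
from typing import Dict, Tuple

from docarray import BaseDoc


class Cfg(BaseDoc):
    meta: Dict[str, int] = {}
    shape: Tuple[int, int] = (0, 0)


a = Cfg(meta={'x': 1}, shape=(1, 2))
b = Cfg(meta={'y': 2}, shape=(3, 4))

a.update(b)
assert a.meta == {'x': 1, 'y': 2}  # dict fields are merged
assert a.shape == (3, 4)  # tuple fields are replaced, not concatenated
```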

helper

create_doc(__model_name, *, __config__=None, __base__=BaseDoc, __module__=__name__, __validators__=None, __cls_kwargs__=None, __slots__=None, **field_definitions)

Dynamically create a subclass of BaseDoc. This is a wrapper around pydantic's create_model.

Note

To pickle a dynamically created BaseDoc subclass:

  • the class must be defined globally
  • it must provide __module__

from docarray.documents import Audio
from docarray.documents.helper import create_doc
from docarray.typing.tensor.audio import AudioNdArray

MyAudio = create_doc(
    'MyAudio',
    __base__=Audio,
    title=(str, ...),
    tensor=(AudioNdArray, ...),
)

assert safe_issubclass(MyAudio, BaseDoc)
assert safe_issubclass(MyAudio, Audio)

Parameters:

Name Type Description Default
__model_name str

name of the created model

required
__config__ Optional[Type[BaseConfig]]

config class to use for the new model

None
__base__ Type[T_doc]

base class for the new model to inherit from, must be BaseDoc or its subclass

BaseDoc
__module__ str

module of the created model

__name__
__validators__ Dict[str, AnyClassMethod]

a dict of method names and @validator class methods

None
__cls_kwargs__ Dict[str, Any]

a dict for class creation

None
__slots__ Optional[Tuple[str, ...]]

Deprecated, __slots__ should not be passed to create_model

None
field_definitions Any

fields of the model (or extra fields if a base is supplied) in the format <name>=(<type>, <default default>) or <name>=<default value>

{}

Returns:

Type Description
Type[T_doc]

the new Document class

Source code in docarray/documents/helper.py
def create_doc(
    __model_name: str,
    *,
    __config__: Optional[Type[BaseConfig]] = None,
    __base__: Type['T_doc'] = BaseDoc,  # type: ignore
    __module__: str = __name__,
    __validators__: Dict[str, 'AnyClassMethod'] = None,  # type: ignore
    __cls_kwargs__: Dict[str, Any] = None,  # type: ignore
    __slots__: Optional[Tuple[str, ...]] = None,
    **field_definitions: Any,
) -> Type['T_doc']:
    """
    Dynamically create a subclass of BaseDoc. This is a wrapper around pydantic's create_model.

    !!! note
        To pickle a dynamically created BaseDoc subclass:

        - the class must be defined globally
        - it must provide `__module__`

    ```python
    from docarray.documents import Audio
    from docarray.documents.helper import create_doc
    from docarray.typing.tensor.audio import AudioNdArray

    MyAudio = create_doc(
        'MyAudio',
        __base__=Audio,
        title=(str, ...),
        tensor=(AudioNdArray, ...),
    )

    assert safe_issubclass(MyAudio, BaseDoc)
    assert safe_issubclass(MyAudio, Audio)
    ```

    :param __model_name: name of the created model
    :param __config__: config class to use for the new model
    :param __base__: base class for the new model to inherit from, must be BaseDoc or its subclass
    :param __module__: module of the created model
    :param __validators__: a dict of method names and @validator class methods
    :param __cls_kwargs__: a dict for class creation
    :param __slots__: Deprecated, `__slots__` should not be passed to `create_model`
    :param field_definitions: fields of the model (or extra fields if a base is supplied)
        in the format `<name>=(<type>, <default default>)` or `<name>=<default value>`
    :return: the new Document class
    """

    if not safe_issubclass(__base__, BaseDoc):
        raise ValueError(f'{type(__base__)} is not a BaseDoc or its subclass')

    doc = create_model(
        __model_name,
        __config__=__config__,
        __base__=__base__,
        __module__=__module__,
        __validators__=__validators__,
        __cls_kwargs__=__cls_kwargs__,
        __slots__=__slots__,
        **field_definitions,
    )

    return doc
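
As the note above says, pickling only works if `pickle` can resolve the class by name again. A minimal sketch under that assumption (`PriceDoc` and its fields are made up):

```python
import pickle

from docarray.documents.helper import create_doc

# Assigned to a module-level name so pickle can resolve it as
# `<module>.PriceDoc`; __module__ records which module that is.
PriceDoc = create_doc(
    'PriceDoc',
    __module__=__name__,
    price=(float, ...),
    currency=(str, 'USD'),
)

doc = PriceDoc(price=9.99)
restored = pickle.loads(pickle.dumps(doc))
assert restored.price == 9.99 and restored.currency == 'USD'
```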

create_doc_from_dict(model_name, data_dict)

Create a subclass of BaseDoc based on example data given as a dictionary.

If the example contains None as a value, the corresponding field is treated as the type Any.


import numpy as np
from docarray.documents import ImageDoc
from docarray.documents.helper import create_doc_from_dict

data_dict = {'image': ImageDoc(tensor=np.random.rand(3, 224, 224)), 'author': 'me'}

MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict)

assert safe_issubclass(MyDoc, BaseDoc)

Parameters:

Name Type Description Default
model_name str

Name of the new Document class

required
data_dict Dict[str, Any]

Dictionary of field types to their corresponding values.

required

Returns:

Type Description
Type[T_doc]

the new Document class

Source code in docarray/documents/helper.py
def create_doc_from_dict(model_name: str, data_dict: Dict[str, Any]) -> Type['T_doc']:
    """
    Create a subclass of BaseDoc based on example data given as a dictionary.

    In case the example contains None as a value,
    corresponding field will be viewed as the type Any.

    ---

    ```python
    import numpy as np
    from docarray.documents import ImageDoc
    from docarray.documents.helper import create_doc_from_dict

    data_dict = {'image': ImageDoc(tensor=np.random.rand(3, 224, 224)), 'author': 'me'}

    MyDoc = create_doc_from_dict(model_name='MyDoc', data_dict=data_dict)

    assert safe_issubclass(MyDoc, BaseDoc)
    ```

    ---

    :param model_name: Name of the new Document class
    :param data_dict: Dictionary of field types to their corresponding values.
    :return: the new Document class
    """
    if not data_dict:
        raise ValueError('`data_dict` should contain at least one item')

    field_types = {
        field: (type(value) if value else Any, ...)
        for field, value in data_dict.items()
    }
    return create_doc(__model_name=model_name, **field_types)  # type: ignore
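
A short sketch of the fallback described above (field names made up). Note that the implementation checks truthiness rather than `is None`, so other falsy example values such as `0` or `''` also fall back to `Any`:

```python
from docarray.documents.helper import create_doc_from_dict

# 'score' is None in the example data, so the field is annotated as Any
MyDoc = create_doc_from_dict(
    model_name='MyDoc',
    data_dict={'title': 'some title', 'score': None},
)

doc = MyDoc(title='hello', score=[0.1, 0.2])  # any type is accepted for `score`
```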

create_doc_from_typeddict(typeddict_cls, **kwargs)

Create a subclass of BaseDoc based on the fields of a TypedDict. This is a wrapper around pydantic's create_model_from_typeddict.


from typing_extensions import TypedDict

from docarray import BaseDoc
from docarray.documents import Audio
from docarray.documents.helper import create_doc_from_typeddict
from docarray.typing.tensor.audio import AudioNdArray


class MyAudio(TypedDict):
    title: str
    tensor: AudioNdArray


Doc = create_doc_from_typeddict(MyAudio, __base__=Audio)

assert safe_issubclass(Doc, BaseDoc)
assert safe_issubclass(Doc, Audio)

Parameters:

Name Type Description Default
typeddict_cls Type[TypedDict]

TypedDict class to use for the new Document class

required
kwargs Any

extra arguments to pass to create_model_from_typeddict

{}

Returns:

Type Description

the new Document class

Source code in docarray/documents/helper.py
def create_doc_from_typeddict(
    typeddict_cls: Type['TypedDict'],  # type: ignore
    **kwargs: Any,
):
    """
    Create a subclass of BaseDoc based on the fields of a `TypedDict`. This is a wrapper around pydantic's create_model_from_typeddict.

    ---

    ```python
    from typing_extensions import TypedDict

    from docarray import BaseDoc
    from docarray.documents import Audio
    from docarray.documents.helper import create_doc_from_typeddict
    from docarray.typing.tensor.audio import AudioNdArray


    class MyAudio(TypedDict):
        title: str
        tensor: AudioNdArray


    Doc = create_doc_from_typeddict(MyAudio, __base__=Audio)

    assert safe_issubclass(Doc, BaseDoc)
    assert safe_issubclass(Doc, Audio)
    ```

    ---

    :param typeddict_cls: TypedDict class to use for the new Document class
    :param kwargs: extra arguments to pass to `create_model_from_typeddict`
    :return: the new Document class
    """

    if '__base__' in kwargs:
        if not safe_issubclass(kwargs['__base__'], BaseDoc):
            raise ValueError(f'{kwargs["__base__"]} is not a BaseDoc or its subclass')
    else:
        kwargs['__base__'] = BaseDoc

    doc = create_model_from_typeddict(typeddict_cls, **kwargs)

    return doc

image

ImageDoc

Bases: BaseDoc

Document for handling images.

It can contain:

  • an ImageUrl (ImageDoc.url)
  • an ImageTensor (ImageDoc.tensor)
  • an AnyEmbedding (ImageDoc.embedding)
  • an ImageBytes object (ImageDoc.bytes_)

You can use this Document directly:

from docarray.documents import ImageDoc

# use it directly
image = ImageDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
)
image.tensor = image.url.load()
# model = MyEmbeddingModel()
# image.embedding = model(image.tensor)

You can extend this Document:

from docarray.documents import ImageDoc
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyImage(ImageDoc):
    second_embedding: Optional[AnyEmbedding] = None


image = MyImage(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
)
image.tensor = image.url.load()
# model = MyEmbeddingModel()
# image.embedding = model(image.tensor)
# image.second_embedding = model(image.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import ImageDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    image: ImageDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    image=ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.image.tensor = mmdoc.image.url.load()

# or
mmdoc.image.bytes_ = mmdoc.image.url.load_bytes()
mmdoc.image.tensor = mmdoc.image.bytes_.load()
Source code in docarray/documents/image.py
class ImageDoc(BaseDoc):
    """
    Document for handling images.

    It can contain:

    - an [`ImageUrl`][docarray.typing.url.ImageUrl] (`Image.url`)
    - an [`ImageTensor`](../../../api_references/typing/tensor/image) (`Image.tensor`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`Image.embedding`)
    - an [`ImageBytes`][docarray.typing.bytes.ImageBytes] object (`ImageDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import ImageDoc

    # use it directly
    image = ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    )
    image.tensor = image.url.load()
    # model = MyEmbeddingModel()
    # image.embedding = model(image.tensor)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import ImageDoc
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyImage(ImageDoc):
        second_embedding: Optional[AnyEmbedding] = None


    image = MyImage(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    )
    image.tensor = image.url.load()
    # model = MyEmbeddingModel()
    # image.embedding = model(image.tensor)
    # image.second_embedding = model(image.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import ImageDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        image: ImageDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        image=ImageDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.image.tensor = mmdoc.image.url.load()

    # or
    mmdoc.image.bytes_ = mmdoc.image.url.load_bytes()
    mmdoc.image.tensor = mmdoc.image.bytes_.load()
    ```
    """

    url: Optional[ImageUrl] = Field(
        description='URL to a (potentially remote) image file that needs to be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true',
        default=None,
    )
    tensor: Optional[ImageTensor] = Field(
        description='Tensor object of the image which can be specifed to one of `ImageNdArray`, `ImageTorchTensor`, `ImageTensorflowTensor`.',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the image.',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[ImageBytes] = Field(
        description='Bytes object of the image which is an instance of `ImageBytes`.',
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif (
            isinstance(value, (AbstractTensor, np.ndarray))
            or (torch is not None and isinstance(value, torch.Tensor))
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)
        elif isinstance(value, bytes):
            value = dict(bytes_=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
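
The `_validate` hook above is what allows an `ImageDoc`-typed field to be populated from a bare string, tensor, or bytes object. A small sketch under that assumption, using a made-up `Page` document:

```python
from docarray import BaseDoc
from docarray.documents import ImageDoc


class Page(BaseDoc):
    banner: ImageDoc


# the before-validator coerces the bare string into ImageDoc(url=...)
page = Page(
    banner='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
)
assert page.banner.url is not None
```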
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
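
The loop at the end exists because pydantic does not handle `DocList` fields on its own; `dict()` exports them as plain lists of per-document dicts. A quick sketch with a made-up `Library` document:

```python
from docarray import BaseDoc, DocList
from docarray.documents import TextDoc


class Library(BaseDoc):
    docs: DocList[TextDoc]


lib = Library(docs=DocList[TextDoc]([TextDoc(text='hi')]))

d = lib.dict()
assert isinstance(d['docs'], list)
assert d['docs'][0]['text'] == 'hi'
```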
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string

Parameters:

Name Type Description Default
data str

a base64 encoded string

required
protocol Literal['pickle', 'protobuf']

protocol to use. It can be 'pickle' or 'protobuf'

'pickle'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
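
Note that the defaults are asymmetric: `from_base64` defaults to `'pickle'` while `to_base64` (below) defaults to `'protobuf'`, so it is safest to pass the protocol explicitly on both ends. A minimal roundtrip sketch:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

b64 = doc.to_base64(protocol='protobuf')
restored = TextDoc.from_base64(b64, protocol='protobuf')
assert restored.text == 'hello'
```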
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

Name Type Description Default
data bytes

binary bytes

required
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build Document object from json data

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
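
A minimal `json()`/`from_json` roundtrip sketch:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')
restored = TextDoc.from_json(doc.json())
assert restored.text == 'hello'
```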
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message

Parameters:

Name Type Description Default
pb_msg DocProto

the proto message of the Document

required

Returns:

Type Description
T

a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

Name Type Description Default
b StrBytes
required
content_type str
None
encoding str

the encoding to use when parsing a string, defaults to 'utf8'

'utf8'
proto Protocol

protocol to use.

None
allow_pickle bool

allow pickle protocol

False

Returns:

Type Description
T

a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
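
Both helpers are purely for inspection, e.g.:

```python
from docarray.documents import TextDoc

TextDoc.schema_summary()  # class level: prints the schema
TextDoc(text='hello').summary()  # instance level: prints non-empty fields
```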
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
str

a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compression algorithm to use

None

Returns:

Type Description
bytes

the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
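
As the docstring suggests, `bytes(doc)` delegates to `to_bytes()` with its defaults. A minimal roundtrip sketch under that assumption:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')

data = bytes(doc)  # same as doc.to_bytes(): protobuf, no compression
assert TextDoc.from_bytes(data).text == 'hello'
```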
to_protobuf()

Convert Document into a Protobuf message.

Returns:

Type Description
DocProto

the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
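
And a `to_protobuf()`/`from_protobuf()` roundtrip for completeness:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello')
pb_msg = doc.to_protobuf()
restored = TextDoc.from_protobuf(pb_msg)
assert restored.text == 'hello'
```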
update(other)

Inherited from BaseDoc. Its description, example, parameters and source are identical to the update(other) entry documented above.
legacy

LegacyDocument

Bases: BaseDoc

This Document is the LegacyDocument. It follows the same schema as in DocArray <=0.21. It can be useful to start migrating a codebase from v1 to v2.

Nevertheless, the API is not totally compatible with DocArray <=0.21 Document. Indeed, none of the methods associated with Document are present. Only the schema of the data is similar.

from docarray import DocList
from docarray.documents.legacy import LegacyDocument
import numpy as np

doc = LegacyDocument(text='hello')
doc.url = 'http://myimg.png'
doc.tensor = np.zeros((3, 224, 224))
doc.embedding = np.zeros((100, 1))

doc.tags['price'] = 10

doc.chunks = DocList[LegacyDocument]([LegacyDocument() for _ in range(10)])

doc.matches = DocList[LegacyDocument]([LegacyDocument() for _ in range(10)])
Source code in docarray/documents/legacy/legacy_document.py
class LegacyDocument(BaseDoc):
    """
    This Document is the LegacyDocument. It follows the same schema as in DocArray <=0.21.
    It can be useful to start migrating a codebase from v1 to v2.

    Nevertheless, the API is not totally compatible with DocArray <=0.21 `Document`.
    Indeed, none of the method associated with `Document` are present. Only the schema
    of the data is similar.

    ```python
    from docarray import DocList
    from docarray.documents.legacy import LegacyDocument
    import numpy as np

    doc = LegacyDocument(text='hello')
    doc.url = 'http://myimg.png'
    doc.tensor = np.zeros((3, 224, 224))
    doc.embedding = np.zeros((100, 1))

    doc.tags['price'] = 10

    doc.chunks = DocList[Document]([Document() for _ in range(10)])

    doc.matches = DocList[Document]([Document() for _ in range(10)])
    ```

    """

    tensor: Optional[AnyTensor] = None
    chunks: Optional[DocList[LegacyDocument]] = None
    matches: Optional[DocList[LegacyDocument]] = None
    blob: Optional[bytes] = None
    text: Optional[str] = None
    url: Optional[str] = None
    embedding: Optional[AnyEmbedding] = None
    tags: Dict[str, Any] = dict()
    scores: Optional[Dict[str, Any]] = None
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
from_base64(data, protocol='pickle', compress=None) classmethod

Build Document object from binary bytes

Parameters:

Name Type Description Default
data str

a base64 encoded string

required
protocol Literal['pickle', 'protobuf']

protocol to use. It can be 'pickle' or 'protobuf'

'pickle'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

Name Type Description Default
data bytes

binary bytes

required
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build Document object from json data

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

create a Document from a protobuf message

Parameters:

Name Type Description Default
pb_msg DocProto

the proto message of the Document

required

Returns:

Type Description
T

a Document initialize with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

Name Type Description Default
b StrBytes
required
content_type str
None
encoding str

the encoding to use when parsing a string, defaults to 'utf8'

'utf8'
proto Protocol

protocol to use.

None
allow_pickle bool

allow pickle protocol

False

Returns:

Type Description
T

a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into as base64 string

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
str

a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compression algorithm to use

None

Returns:

Type Description
bytes

the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

Type Description
DocProto

the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists in the following:

  • Setting data properties of the second Document to the first Document if they are not None
  • Concatenating lists and updating sets
  • Updating recursively Documents and DocLists
  • Updating Dictionaries of the left with the right

It behaves as an update operation for Dictionaries, except that since it is applied to a static schema type, the presence of the field is given by the field not having a None value and that DocLists, lists and sets are concatenated. It is worth mentioning that Tuples are not merged together since they are meant to be immutable, so they behave as regular types and the value of self is updated with the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

Name Type Description Default
other T

The Document with which to update the contents of this

required
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
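
To make the merge rules above concrete, here is a minimal sketch against the implementation shown; `TaggedDoc` is a hypothetical schema, not part of docarray:

```python
from typing import Dict, List

from docarray import BaseDoc


class TaggedDoc(BaseDoc):
    labels: List[str] = []
    meta: Dict[str, str] = {}


a = TaggedDoc(labels=['x'], meta={'lang': 'en', 'src': 'web'})
b = TaggedDoc(labels=['y'], meta={'src': 'pdf'})

a.update(b)
assert a.labels == ['x', 'y']  # list fields are concatenated
assert a.meta == {'lang': 'en', 'src': 'pdf'}  # dict fields are updated key by key
```
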

legacy_document

LegacyDocument

Bases: BaseDoc

This Document is the LegacyDocument. It follows the same schema as in DocArray <=0.21. It can be useful to start migrating a codebase from v1 to v2.

Nevertheless, the API is not totally compatible with the DocArray <=0.21 Document: none of the methods associated with Document are present. Only the schema of the data is similar.

from docarray import DocList
from docarray.documents.legacy import LegacyDocument
import numpy as np

doc = LegacyDocument(text='hello')
doc.url = 'http://myimg.png'
doc.tensor = np.zeros((3, 224, 224))
doc.embedding = np.zeros((100, 1))

doc.tags['price'] = 10

doc.chunks = DocList[LegacyDocument]([LegacyDocument() for _ in range(10)])

doc.matches = DocList[LegacyDocument]([LegacyDocument() for _ in range(10)])
Source code in docarray/documents/legacy/legacy_document.py
class LegacyDocument(BaseDoc):
    """
    This Document is the LegacyDocument. It follows the same schema as in DocArray <=0.21.
    It can be useful to start migrating a codebase from v1 to v2.

    Nevertheless, the API is not totally compatible with DocArray <=0.21 `Document`.
    Indeed, none of the methods associated with `Document` are present. Only the schema
    of the data is similar.

    ```python
    from docarray import DocList
    from docarray.documents.legacy import LegacyDocument
    import numpy as np

    doc = LegacyDocument(text='hello')
    doc.url = 'http://myimg.png'
    doc.tensor = np.zeros((3, 224, 224))
    doc.embedding = np.zeros((100, 1))

    doc.tags['price'] = 10

    doc.chunks = DocList[LegacyDocument]([LegacyDocument() for _ in range(10)])

    doc.matches = DocList[LegacyDocument]([LegacyDocument() for _ in range(10)])
    ```

    """

    tensor: Optional[AnyTensor] = None
    chunks: Optional[DocList[LegacyDocument]] = None
    matches: Optional[DocList[LegacyDocument]] = None
    blob: Optional[bytes] = None
    text: Optional[str] = None
    url: Optional[str] = None
    embedding: Optional[AnyEmbedding] = None
    tags: Dict[str, Any] = dict()
    scores: Optional[Dict[str, Any]] = None
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
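
A small sketch of field exclusion; `exclude` behaves as in pydantic's `dict()`:

```python
from docarray.documents.legacy import LegacyDocument

doc = LegacyDocument(text='hello', url='http://myimg.png')

# excluded fields are dropped from the resulting dictionary
d = doc.dict(exclude={'tensor', 'embedding'})
assert d['text'] == 'hello'
assert 'tensor' not in d
```
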
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64 encoded string

Parameters:

  • data (str): a base64 encoded string (required)
  • protocol (Literal['pickle', 'protobuf']): protocol to use; can be 'pickle' or 'protobuf' (default: 'pickle')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
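
Since to_base64 defaults to 'protobuf' while from_base64 defaults to 'pickle', it is safest to pass the protocol explicitly on both sides. A minimal round-trip sketch:

```python
from docarray.documents.legacy import LegacyDocument

doc = LegacyDocument(text='hello')

# to_base64 defaults to 'protobuf', from_base64 to 'pickle';
# pass the protocol explicitly so both sides agree
s = doc.to_base64(protocol='protobuf')
doc2 = LegacyDocument.from_base64(s, protocol='protobuf')
assert doc2.text == 'hello'
```
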
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes

Parameters:

  • data (bytes): binary bytes (required)
  • protocol (ProtocolType): protocol to use; can be 'pickle' or 'protobuf' (default: 'protobuf')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
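
A minimal round-trip sketch with compression, assuming 'gzip' is among the algorithms supported by the compression helpers:

```python
from docarray.documents.legacy import LegacyDocument

doc = LegacyDocument(text='hello')

# serialize with protobuf and gzip, then rebuild with the same settings
data = doc.to_bytes(protocol='protobuf', compress='gzip')
doc2 = LegacyDocument.from_bytes(data, protocol='protobuf', compress='gzip')
assert doc2.text == 'hello'
```
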
from_json(data) classmethod

Build a Document object from JSON data

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message

Parameters:

  • pb_msg (DocProto): the proto message of the Document (required)

Returns:

  • T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
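
A minimal round-trip sketch pairing this with to_protobuf (documented later in this section):

```python
from docarray.documents.legacy import LegacyDocument

doc = LegacyDocument(text='hello')

pb_msg = doc.to_protobuf()  # a DocProto message
doc2 = LegacyDocument.from_protobuf(pb_msg)
assert doc2.text == 'hello'
```
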
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
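
A minimal sketch pairing json() with from_json (documented above):

```python
from docarray.documents.legacy import LegacyDocument

doc = LegacyDocument(text='hello')

json_str = doc.json()
doc2 = LegacyDocument.from_json(json_str)
assert doc2.text == 'hello'
```
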
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

  • b (StrBytes): (required)
  • content_type (str): (default: None)
  • encoding (str): the encoding to use when parsing a string (default: 'utf8')
  • proto (Protocol): protocol to use (default: None)
  • allow_pickle (bool): allow pickle protocol (default: False)

Returns:

  • T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

  • protocol (ProtocolType): protocol to use; can be 'pickle' or 'protobuf' (default: 'protobuf')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • str: a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

  • protocol (ProtocolType): protocol to use; can be 'pickle' or 'protobuf' (default: 'protobuf')
  • compress (Optional[str]): compression algorithm to use (default: None)

Returns:

  • bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

  • DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied in place to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Updating Documents and DocLists recursively
  • Updating the Dictionaries of the left with those of the right

It behaves like a dictionary update, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Tuples are not merged, since they are meant to be immutable: they behave as regular types, and the value of self is replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

  • other (T): the Document with which to update the contents of this one (required)
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied in place to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document on the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating Documents and DocLists recursively
     - Updating the Dictionaries of the left with those of the right

    It behaves like a dictionary update, except that, since it is applied
    to a static schema type, a field counts as present when its value is
    not None, and DocLists, lists and sets are concatenated. Tuples are
    not merged, since they are meant to be immutable: they behave as
    regular types, and the value of `self` is replaced by the value
    of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

mesh

Mesh3D

Bases: BaseDoc

Document for handling meshes for 3D data representation.

A mesh is a representation of 3D data consisting of vertices and faces. Vertices are points in 3D space, represented as a tensor of shape (n_points, 3). Faces are triangular surfaces, each defined by three points in 3D space corresponding to the three vertices of a triangle; they can be represented as a tensor of shape (n_faces, 3), where each entry is the index of a vertex in the tensor of vertices.

The Mesh3D Document can contain:

  • a Mesh3DUrl (Mesh3D.url)
  • a VerticesAndFaces object containing an AnyTensor of vertices (Mesh3D.tensors.vertices) and an AnyTensor of faces (Mesh3D.tensors.faces)
  • an AnyEmbedding (Mesh3D.embedding)
  • a bytes object (Mesh3D.bytes_)

You can use this Document directly:

from docarray.documents import Mesh3D

# use it directly
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.tensors.vertices)

You can extend this Document:

from docarray.documents import Mesh3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyMesh3D(Mesh3D):
    name: Optional[str] = None


mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.name = 'my first mesh'
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.vertices)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import Mesh3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    mesh: Mesh3D
    text: TextDoc


mmdoc = MultiModalDoc(
    mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.mesh.tensors = mmdoc.mesh.url.load()

# or
mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()

You can display your 3D mesh in a notebook from either its url or its tensors:

from docarray.documents import Mesh3D

# display from url
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# mesh.url.display()

# display from tensors
mesh.tensors = mesh.url.load()
# mesh.tensors.display()
Source code in docarray/documents/mesh/mesh_3d.py
class Mesh3D(BaseDoc):
    """
    Document for handling meshes for 3D data representation.

    A mesh is a representation for 3D data and contains vertices and faces information.
    Vertices are points in a 3D space, represented as a tensor of shape (n_points, 3).
    Faces are triangular surfaces that can be defined by three points in 3D space,
    corresponding to the three vertices of a triangle. Faces can be represented as a
    tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a
    vertex in the tensor of vertices.

    The Mesh3D Document can contain:

    - an [`Mesh3DUrl`][docarray.typing.url.Mesh3DUrl] (`Mesh3D.url`)
    - a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces]
    object containing:

        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of
        vertices (`Mesh3D.tensors.vertices`)
        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of faces (`Mesh3D.tensors.faces`)

    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`Mesh3D.embedding`)
    - a `bytes` object (`Mesh3D.bytes_`).

    You can use this Document directly:

    ```python
    from docarray.documents import Mesh3D

    # use it directly
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.tensors.vertices)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import Mesh3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyMesh3D(Mesh3D):
        name: Optional[str] = None


    mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.name = 'my first mesh'
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.vertices)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import Mesh3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        mesh: Mesh3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.mesh.tensors = mmdoc.mesh.url.load()

    # or
    mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()
    ```

    You can display your 3D mesh in a notebook from either its url, or its tensors:

    ```python
    from docarray.documents import Mesh3D

    # display from url
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # mesh.url.display()

    # display from tensors
    mesh.tensors = mesh.url.load()
    # mesh.tensors.display()
    ```

    """

    url: Optional[Mesh3DUrl] = Field(
        description='URL to a file containing 3D mesh information. Can be remote (web) URL, or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[VerticesAndFaces] = Field(
        description='A tensor object of 3D mesh of type `VerticesAndFaces`.',
        example=[[0, 1, 1], [1, 0, 1], [1, 1, 0]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the 3D mesh.',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D mesh.',
        default=None,
    )

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            if isinstance(value, str):
                return {'url': value}
            return value

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, Any],
        ) -> T:
            if isinstance(value, str):
                value = cls(url=value)
            return super().validate(value)
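
Beyond loading tensors from a URL, you can populate them directly. A minimal sketch with a single triangle, following the (n_points, 3) and (n_faces, 3) shape conventions described above (the VerticesAndFaces import path is taken from the source reference shown later in this section):

```python
import numpy as np

from docarray.documents import Mesh3D
from docarray.documents.mesh.vertices_and_faces import VerticesAndFaces

# a single triangle: three vertices and one face indexing into them
vertices = np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
faces = np.array([[0, 1, 2]])

mesh = Mesh3D(tensors=VerticesAndFaces(vertices=vertices, faces=faces))
assert mesh.tensors.vertices.shape == (3, 3)
assert mesh.tensors.faces.shape == (1, 3)
```
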
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64 encoded string

Parameters:

  • data (str): a base64 encoded string (required)
  • protocol (Literal['pickle', 'protobuf']): protocol to use; can be 'pickle' or 'protobuf' (default: 'pickle')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes

Parameters:

  • data (bytes): binary bytes (required)
  • protocol (ProtocolType): protocol to use; can be 'pickle' or 'protobuf' (default: 'protobuf')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build a Document object from JSON data

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message

Parameters:

  • pb_msg (DocProto): the proto message of the Document (required)

Returns:

  • T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

  • b (StrBytes): (required)
  • content_type (str): (default: None)
  • encoding (str): the encoding to use when parsing a string (default: 'utf8')
  • proto (Protocol): protocol to use (default: None)
  • allow_pickle (bool): allow pickle protocol (default: False)

Returns:

  • T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

  • protocol (ProtocolType): protocol to use; can be 'pickle' or 'protobuf' (default: 'protobuf')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • str: a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

  • protocol (ProtocolType): protocol to use; can be 'pickle' or 'protobuf' (default: 'protobuf')
  • compress (Optional[str]): compression algorithm to use (default: None)

Returns:

  • bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

  • DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied in place to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Updating Documents and DocLists recursively
  • Updating the Dictionaries of the left with those of the right

It behaves like a dictionary update, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Tuples are not merged, since they are meant to be immutable: they behave as regular types, and the value of self is replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

  • other (T): the Document with which to update the contents of this one (required)
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied in place to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document on the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating Documents and DocLists recursively
     - Updating the Dictionaries of the left with those of the right

    It behaves like a dictionary update, except that, since it is applied
    to a static schema type, a field counts as present when its value is
    not None, and DocLists, lists and sets are concatenated. Tuples are
    not merged, since they are meant to be immutable: they behave as
    regular types, and the value of `self` is replaced by the value
    of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

VerticesAndFaces

Bases: BaseDoc

Document for handling the tensor data of a Mesh3D object.

A VerticesAndFaces Document can contain:

  • an AnyTensor containing the vertices information (VerticesAndFaces.vertices)
  • an AnyTensor containing the faces information (VerticesAndFaces.faces)
Source code in docarray/documents/mesh/vertices_and_faces.py
class VerticesAndFaces(BaseDoc):
    """
    Document for handling the tensor data of a [`Mesh3D`][docarray.documents.mesh.Mesh3D] object.

    A VerticesAndFaces Document can contain:

    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the vertices information (`VerticesAndFaces.vertices`)
    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the faces information (`VerticesAndFaces.faces`)
    """

    vertices: AnyTensor
    faces: AnyTensor

    @classmethod
    def _docarray_validate(
        cls: Type[T],
        value: Union[str, Any],
    ) -> T:
        return super().validate(value)

    def display(self) -> None:
        """
        Plot mesh consisting of vertices and faces.
        """
        if TYPE_CHECKING:
            import trimesh
        else:
            trimesh = import_library('trimesh', raise_error=True)

        from IPython.display import display

        if self.vertices is None or self.faces is None:
            raise ValueError(
                'Can\'t display mesh from tensors when the vertices and/or faces '
                'are None.'
            )

        mesh = trimesh.Trimesh(vertices=self.vertices, faces=self.faces)
        display(mesh.show())
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
display()

Plot mesh consisting of vertices and faces.

Source code in docarray/documents/mesh/vertices_and_faces.py
def display(self) -> None:
    """
    Plot mesh consisting of vertices and faces.
    """
    if TYPE_CHECKING:
        import trimesh
    else:
        trimesh = import_library('trimesh', raise_error=True)

    from IPython.display import display

    if self.vertices is None or self.faces is None:
        raise ValueError(
            'Can\'t display mesh from tensors when the vertices and/or faces '
            'are None.'
        )

    mesh = trimesh.Trimesh(vertices=self.vertices, faces=self.faces)
    display(mesh.show())
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64 encoded string

Parameters:

  • data (str): a base64 encoded string (required)
  • protocol (Literal['pickle', 'protobuf']): protocol to use; can be 'pickle' or 'protobuf' (default: 'pickle')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes

Parameters:

  • data (bytes): binary bytes (required)
  • protocol (ProtocolType): protocol to use; can be 'pickle' or 'protobuf' (default: 'protobuf')
  • compress (Optional[str]): compress method to use (default: None)

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build a Document object from JSON data

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
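The JSON round trip pairs with the `json()` method documented further down; a sketch with the hypothetical `NoteDoc`:

```python
doc = NoteDoc(text='hello')

json_str = doc.json()
restored = NoteDoc.from_json(json_str)
assert restored.text == 'hello'
```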
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pb_msg` | `DocProto` | the proto message of the Document | *required* |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
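This is the inverse of `to_protobuf` (documented below); a sketch with the hypothetical `NoteDoc`:

```python
doc = NoteDoc(text='hello')

pb_msg = doc.to_protobuf()
restored = NoteDoc.from_protobuf(pb_msg)
assert restored.text == 'hello'
```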
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model; the `include` and `exclude` arguments behave as in `dict()`.

`encoder` is an optional function to supply as `default` to `json.dumps()`; other arguments are passed through to `json.dumps()`.

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # keep the DocList as a DocList; otherwise a user's custom JSON config would not apply

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
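For example, field selection works as in pydantic (a sketch with the hypothetical `NoteDoc`; `id` is the built-in BaseDoc field):

```python
doc = NoteDoc(text='hello')

# drop the auto-generated id from the serialized output
json_str = doc.json(exclude={'id'})
assert 'hello' in json_str
```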
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `b` | `StrBytes` |  | *required* |
| `content_type` | `str` |  | `None` |
| `encoding` | `str` | the encoding to use when parsing a string, defaults to 'utf8' | `'utf8'` |
| `proto` | `Protocol` | protocol to use | `None` |
| `allow_pickle` | `bool` | allow pickle protocol | `False` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
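Both helpers only print and return None; a sketch with the hypothetical `NoteDoc`:

```python
# class-level view of the schema
NoteDoc.schema_summary()

# instance-level view of the non-empty fields
NoteDoc(text='hello').summary()
```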
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `str` | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compression algorithm to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `bytes` | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
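As the docstring suggests, `bytes(...)` is the more Pythonic spelling; assuming `__bytes__` delegates to `to_bytes()` with default arguments (a sketch with the hypothetical `NoteDoc`):

```python
doc = NoteDoc(text='hello')

assert bytes(doc) == doc.to_bytes()
```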
to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
| --- | --- |
| `DocProto` | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self, in place. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating Dictionaries of the left with the right

It behaves like a dictionary update, except that, because it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave as regular values, and the value in self is simply replaced by the value in other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `other` | `T` | The Document with which to update the contents of this one | *required* |
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document on the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Recursively updating Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves like a dictionary update, except that, because it is applied
    to a static schema type, a field counts as present when its value is
    not None, and DocLists, lists and sets are concatenated rather than
    replaced. Tuples are not merged, since they are meant to be immutable:
    they behave as regular values, and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
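To complement the list-merging example above, here is a sketch of how dict fields merge under `update` (the `MetaDoc` schema is hypothetical; the behavior follows the `dict_fields` branch in the code above):

```python
from typing import Dict, Optional

from docarray import BaseDoc


class MetaDoc(BaseDoc):
    tags: Optional[Dict[str, str]] = None


left = MetaDoc(tags={'lang': 'en', 'topic': 'mesh'})
right = MetaDoc(tags={'topic': '3d'})

# dict fields of the left doc are updated with the right doc's entries
left.update(right)
assert left.tags == {'lang': 'en', 'topic': '3d'}
```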

mesh_3d

Mesh3D

Bases: BaseDoc

Document for handling meshes for 3D data representation.

A mesh is a representation for 3D data and contains vertices and faces information. Vertices are points in a 3D space, represented as a tensor of shape (n_points, 3). Faces are triangular surfaces that can be defined by three points in 3D space, corresponding to the three vertices of a triangle. Faces can be represented as a tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a vertex in the tensor of vertices.

The Mesh3D Document can contain:

  • a Mesh3DUrl (Mesh3D.url)
  • a VerticesAndFaces object with an AnyTensor of vertices (Mesh3D.tensors.vertices) and an AnyTensor of faces (Mesh3D.tensors.faces)
  • an AnyEmbedding (Mesh3D.embedding)
  • a bytes object (Mesh3D.bytes_)

You can use this Document directly:

from docarray.documents import Mesh3D

# use it directly
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.tensors.vertices)

You can extend this Document:

from docarray.documents import Mesh3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyMesh3D(Mesh3D):
    name: Optional[str] = None


mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
mesh.name = 'my first mesh'
mesh.tensors = mesh.url.load()
# model = MyEmbeddingModel()
# mesh.embedding = model(mesh.tensors.vertices)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import Mesh3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    mesh: Mesh3D
    text: TextDoc


mmdoc = MultiModalDoc(
    mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.mesh.tensors = mmdoc.mesh.url.load()

# or
mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()

You can display your 3D mesh in a notebook from either its url, or its tensors:

from docarray.documents import Mesh3D

# display from url
mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# mesh.url.display()

# display from tensors
mesh.tensors = mesh.url.load()
# mesh.tensors.display()
Source code in docarray/documents/mesh/mesh_3d.py
class Mesh3D(BaseDoc):
    """
    Document for handling meshes for 3D data representation.

    A mesh is a representation for 3D data and contains vertices and faces information.
    Vertices are points in a 3D space, represented as a tensor of shape (n_points, 3).
    Faces are triangular surfaces that can be defined by three points in 3D space,
    corresponding to the three vertices of a triangle. Faces can be represented as a
    tensor of shape (n_faces, 3). Each number in that tensor refers to an index of a
    vertex in the tensor of vertices.

    The Mesh3D Document can contain:

    - a [`Mesh3DUrl`][docarray.typing.url.Mesh3DUrl] (`Mesh3D.url`)
    - a [`VerticesAndFaces`][docarray.documents.mesh.vertices_and_faces.VerticesAndFaces]
    object containing:

        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of
        vertices (`Mesh3D.tensors.vertices`)
        - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor) of faces (`Mesh3D.tensors.faces`)

    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`Mesh3D.embedding`)
    - a `bytes` object (`Mesh3D.bytes_`).

    You can use this Document directly:

    ```python
    from docarray.documents import Mesh3D

    # use it directly
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.tensors.vertices)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import Mesh3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyMesh3D(Mesh3D):
        name: Optional[str] = None


    mesh = MyMesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    mesh.name = 'my first mesh'
    mesh.tensors = mesh.url.load()
    # model = MyEmbeddingModel()
    # mesh.embedding = model(mesh.tensors.vertices)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import Mesh3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        mesh: Mesh3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        mesh=Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.mesh.tensors = mmdoc.mesh.url.load()

    # or
    mmdoc.mesh.bytes_ = mmdoc.mesh.url.load_bytes()
    ```

    You can display your 3D mesh in a notebook from either its url, or its tensors:

    ```python
    from docarray.documents import Mesh3D

    # display from url
    mesh = Mesh3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # mesh.url.display()

    # display from tensors
    mesh.tensors = mesh.url.load()
    # mesh.tensors.display()
    ```

    """

    url: Optional[Mesh3DUrl] = Field(
        description='URL to a file containing 3D mesh information. Can be remote (web) URL, or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[VerticesAndFaces] = Field(
        description='A tensor object of 3D mesh of type `VerticesAndFaces`.',
        example=[[0, 1, 1], [1, 0, 1], [1, 1, 0]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the 3D mesh.',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D mesh.',
        default=None,
    )

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            if isinstance(value, str):
                return {'url': value}
            return value

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, Any],
        ) -> T:
            if isinstance(value, str):
                value = cls(url=value)
            return super().validate(value)
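The validators at the end of the class are what let you pass a bare URL string wherever a Mesh3D is expected; a sketch whose behavior is inferred from the validator code above (the `SceneDoc` schema is hypothetical):

```python
from docarray import BaseDoc
from docarray.documents import Mesh3D


class SceneDoc(BaseDoc):
    mesh: Mesh3D


# a plain string is coerced into Mesh3D(url=...) by the validator
scene = SceneDoc(mesh='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
assert scene.mesh.url is not None
```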
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `str` | a base64 encoded string | *required* |
| `protocol` | `Literal['pickle', 'protobuf']` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'pickle'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `bytes` | binary bytes | *required* |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build a Document object from JSON data.

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pb_msg` | `DocProto` | the proto message of the Document | *required* |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model; the `include` and `exclude` arguments behave as in `dict()`.

`encoder` is an optional function to supply as `default` to `json.dumps()`; other arguments are passed through to `json.dumps()`.

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # keep the DocList as a DocList; otherwise a user's custom JSON config would not apply

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `b` | `StrBytes` |  | *required* |
| `content_type` | `str` |  | `None` |
| `encoding` | `str` | the encoding to use when parsing a string, defaults to 'utf8' | `'utf8'` |
| `proto` | `Protocol` | protocol to use | `None` |
| `allow_pickle` | `bool` | allow pickle protocol | `False` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `str` | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compression algorithm to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `bytes` | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

| Type | Description |
| --- | --- |
| `DocProto` | the protobuf message |

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self, in place. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating Dictionaries of the left with the right

It behaves like a dictionary update, except that, because it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave as regular values, and the value in self is simply replaced by the value in other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `other` | `T` | The Document with which to update the contents of this one | *required* |
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document on the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Recursively updating Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves like a dictionary update, except that, because it is applied
    to a static schema type, a field counts as present when its value is
    not None, and DocLists, lists and sets are concatenated rather than
    replaced. Tuples are not merged, since they are meant to be immutable:
    they behave as regular values, and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

vertices_and_faces

VerticesAndFaces

Bases: BaseDoc

Document for handling the tensor data of a Mesh3D object.

A VerticesAndFaces Document can contain:

  • an AnyTensor containing the vertices information (VerticesAndFaces.vertices)
  • an AnyTensor containing the faces information (VerticesAndFaces.faces)
Source code in docarray/documents/mesh/vertices_and_faces.py
class VerticesAndFaces(BaseDoc):
    """
    Document for handling the tensor data of a [`Mesh3D`][docarray.documents.mesh.Mesh3D] object.

    A VerticesAndFaces Document can contain:

    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the vertices information (`VerticesAndFaces.vertices`)
    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the faces information (`VerticesAndFaces.faces`)
    """

    vertices: AnyTensor
    faces: AnyTensor

    @classmethod
    def _docarray_validate(
        cls: Type[T],
        value: Union[str, Any],
    ) -> T:
        return super().validate(value)

    def display(self) -> None:
        """
        Plot mesh consisting of vertices and faces.
        """
        if TYPE_CHECKING:
            import trimesh
        else:
            trimesh = import_library('trimesh', raise_error=True)

        from IPython.display import display

        if self.vertices is None or self.faces is None:
            raise ValueError(
                'Can\'t display mesh from tensors when the vertices and/or faces '
                'are None.'
            )

        mesh = trimesh.Trimesh(vertices=self.vertices, faces=self.faces)
        display(mesh.show())
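Since `vertices` and `faces` are tensors, you can also build the document directly from arrays; a sketch using numpy, assuming `VerticesAndFaces` is importable from `docarray.documents.mesh` (shapes follow the Mesh3D description above):

```python
import numpy as np

from docarray.documents.mesh import VerticesAndFaces

# a single triangle: three 3D points and one face indexing them
tensors = VerticesAndFaces(
    vertices=np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]),
    faces=np.array([[0, 1, 2]]),
)
# tensors.display()  # requires trimesh and a notebook frontend
```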
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
display()

Plot mesh consisting of vertices and faces.

Source code in docarray/documents/mesh/vertices_and_faces.py
def display(self) -> None:
    """
    Plot mesh consisting of vertices and faces.
    """
    if TYPE_CHECKING:
        import trimesh
    else:
        trimesh = import_library('trimesh', raise_error=True)

    from IPython.display import display

    if self.vertices is None or self.faces is None:
        raise ValueError(
            'Can\'t display mesh from tensors when the vertices and/or faces '
            'are None.'
        )

    mesh = trimesh.Trimesh(vertices=self.vertices, faces=self.faces)
    display(mesh.show())
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `str` | a base64 encoded string | *required* |
| `protocol` | `Literal['pickle', 'protobuf']` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'pickle'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `data` | `bytes` | binary bytes | *required* |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build a Document object from JSON data.

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document object |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `pb_msg` | `DocProto` | the proto message of the Document | *required* |

Returns:

| Type | Description |
| --- | --- |
| `T` | a Document initialized with the proto data |

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model; the `include` and `exclude` arguments behave as in `dict()`.

`encoder` is an optional function to supply as `default` to `json.dumps()`; other arguments are passed through to `json.dumps()`.

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # keep the DocList as a DocList; otherwise a user's custom JSON config would not apply

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `b` | `StrBytes` |  | *required* |
| `content_type` | `str` |  | `None` |
| `encoding` | `str` | the encoding to use when parsing a string, defaults to 'utf8' | `'utf8'` |
| `proto` | `Protocol` | protocol to use | `None` |
| `allow_pickle` | `bool` | allow pickle protocol | `False` |

Returns:

| Type | Description |
| --- | --- |
| `T` | a document |

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compress method to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `str` | a base64 encoded string |

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `protocol` | `ProtocolType` | protocol to use. It can be `'pickle'` or `'protobuf'` | `'protobuf'` |
| `compress` | `Optional[str]` | compression algorithm to use | `None` |

Returns:

| Type | Description |
| --- | --- |
| `bytes` | the binary serialization in bytes |

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

Type Description
DocProto

the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
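
A protobuf round-trip sketch pairing this method with from_protobuf (MyDoc is a hypothetical example class):

from docarray import BaseDoc


class MyDoc(BaseDoc):
    text: str


doc = MyDoc(text='hello')
pb_msg = doc.to_protobuf()  # a DocProto message
restored = MyDoc.from_protobuf(pb_msg)
assert restored.text == doc.text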
update(other)

Updates self with the content of other. Changes are applied in place. Updating one Document with another consists of the following:

  • Copying the data properties of the second Document onto the first, when they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating the Dictionaries of the left with those of the right

It behaves like a dictionary update, except that, because it operates on a static schema type, a field counts as present when its value is not None, and DocLists, lists, and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave like regular values, and the value of self is simply replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']
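
The same rules apply to dictionary, set, and tuple fields; a sketch of those cases (MyRecord is a hypothetical example class):

from typing import Dict, Set, Tuple

from docarray import BaseDoc


class MyRecord(BaseDoc):
    meta: Dict[str, str]
    labels: Set[str]
    shape: Tuple[int, int]


r1 = MyRecord(meta={'a': '1'}, labels={'x'}, shape=(1, 2))
r2 = MyRecord(meta={'b': '2'}, labels={'y'}, shape=(3, 4))

r1.update(r2)
assert r1.meta == {'a': '1', 'b': '2'}  # dicts of the left updated with the right
assert r1.labels == {'x', 'y'}  # sets are merged
assert r1.shape == (3, 4)  # tuples are replaced, not merged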

Parameters:

Name Type Description Default
other T

The Document with which to update the contents of this one

required
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

point_cloud

PointCloud3D

Bases: BaseDoc

Document for handling point clouds for 3D data representation.

A point cloud is a representation of a 3D mesh, made by repeatedly and uniformly sampling points on the surface of the 3D body. Compared to the mesh representation, a point cloud is a fixed-size ndarray of shape (n_samples, 3), which makes it easier for deep learning algorithms to handle.

A PointCloud3D Document can contain:

  • a PointCloud3DUrl (PointCloud3D.url)
  • a PointsAndColors object (PointCloud3D.tensors)
  • an AnyEmbedding (PointCloud3D.embedding)
  • a bytes object (PointCloud3D.bytes_)

You can use this Document directly:

from docarray.documents import PointCloud3D

# use it directly
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)

You can extend this Document:

from docarray.documents import PointCloud3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyPointCloud3D(PointCloud3D):
    second_embedding: Optional[AnyEmbedding] = None


pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)
# pc.second_embedding = model(pc.tensors.colors)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import PointCloud3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    point_cloud: PointCloud3D
    text: TextDoc


mmdoc = MultiModalDoc(
    point_cloud=PointCloud3D(
        url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

# or
mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()

You can display your point cloud from either its url or its tensors:

from docarray.documents import PointCloud3D

# display from url
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# pc.url.display()

# display from tensors
pc.tensors = pc.url.load(samples=10000)
# pc.tensors.display()
Source code in docarray/documents/point_cloud/point_cloud_3d.py
class PointCloud3D(BaseDoc):
    """
    Document for handling point clouds for 3D data representation.

    Point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly
    sampling points within the surface of the 3D body. Compared to the mesh
    representation, the point cloud is a fixed size ndarray of shape `(n_samples, 3)` and
    hence easier for deep learning algorithms to handle.

    A PointCloud3D Document can contain:

    - a [`PointCloud3DUrl`][docarray.typing.url.PointCloud3DUrl] (`PointCloud3D.url`)
    - a [`PointsAndColors`][docarray.documents.point_cloud.points_and_colors.PointsAndColors] object (`PointCloud3D.tensors`)
    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`PointCloud3D.embedding`)
    - a `bytes` object (`PointCloud3D.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import PointCloud3D

    # use it directly
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import PointCloud3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyPointCloud3D(PointCloud3D):
        second_embedding: Optional[AnyEmbedding] = None


    pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    # pc.second_embedding = model(pc.tensors.colors)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import PointCloud3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        point_cloud: PointCloud3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        point_cloud=PointCloud3D(
            url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

    # or
    mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()
    ```

    You can display your point cloud from either its url, or its tensors:

    ```python
    from docarray.documents import PointCloud3D

    # display from url
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # pc.url.display()

    # display from tensors
    pc.tensors = pc.url.load(samples=10000)
    # pc.tensors.display()
    ```
    """

    url: Optional[PointCloud3DUrl] = Field(
        description='URL to a file containing point cloud information. Can be remote (web) URL, or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[PointsAndColors] = Field(
        description='A tensor object of 3D point cloud of type `PointsAndColors`.',
        example=[[0, 0, 1], [1, 0, 1], [0, 1, 1]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of 3D point cloud.',
        example=[1, 1, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D point cloud.',
        default=None,
    )

    @classmethod
    def _validate(self, value: Union[str, AbstractTensor, Any]) -> Any:
        if isinstance(value, str):
            value = {'url': value}
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = {'tensors': PointsAndColors(points=value)}

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
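
The validators above coerce convenient inputs: a plain string becomes the url field, and a raw tensor becomes the tensors field. A minimal sketch of that coercion (Scene is a hypothetical wrapper class):

import numpy as np

from docarray import BaseDoc
from docarray.documents import PointCloud3D


class Scene(BaseDoc):
    pc: PointCloud3D


# a URL string is coerced into PointCloud3D(url=...)
scene = Scene(pc='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')

# a raw (n_samples, 3) array is coerced into
# PointCloud3D(tensors=PointsAndColors(points=...))
scene = Scene(pc=np.zeros((100, 3)))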
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
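
Under the pydantic-v1 code path shown above, a short sketch of field exclusion (MyDoc is a hypothetical example class):

from docarray import BaseDoc


class MyDoc(BaseDoc):
    text: str
    score: float = 0.0


doc = MyDoc(text='hello')
d = doc.dict(exclude={'id'})  # drop the auto-generated id field
# d -> {'text': 'hello', 'score': 0.0}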
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string

Parameters:

Name Type Description Default
data str

a base64 encoded string

required
protocol Literal['pickle', 'protobuf']

protocol to use. It can be 'pickle' or 'protobuf'

'pickle'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes

Parameters:

Name Type Description Default
data bytes

binary bytes

required
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build a Document object from JSON data

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
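
A JSON round-trip sketch pairing this method with json() (MyDoc is a hypothetical example class):

from docarray import BaseDoc


class MyDoc(BaseDoc):
    text: str


doc = MyDoc(text='hello')
restored = MyDoc.from_json(doc.json())
assert restored.text == doc.text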
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message

Parameters:

Name Type Description Default
pb_msg DocProto

the proto message of the Document

required

Returns:

Type Description
T

a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

Name Type Description Default
b StrBytes
required
content_type str
None
encoding str

the encoding to use when parsing a string, defaults to 'utf8'

'utf8'
proto Protocol

protocol to use.

None
allow_pickle bool

allow pickle protocol

False

Returns:

Type Description
T

a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
str

a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compression algorithm to use

None

Returns:

Type Description
bytes

the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

Type Description
DocProto

the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied in place. Updating one Document with another consists of the following:

  • Copying the data properties of the second Document onto the first, when they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating the Dictionaries of the left with those of the right

It behaves like a dictionary update, except that, because it operates on a static schema type, a field counts as present when its value is not None, and DocLists, lists, and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave like regular values, and the value of self is simply replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

Name Type Description Default
other T

The Document with which to update the contents of this one

required
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

PointsAndColors

Bases: BaseDoc

Document for handling the tensor data of a PointCloud3D object.

A PointsAndColors Document can contain:

  • an AnyTensor containing the points in 3D space information (PointsAndColors.points)
  • an AnyTensor containing the points' color information (PointsAndColors.colors)
Source code in docarray/documents/point_cloud/points_and_colors.py
class PointsAndColors(BaseDoc):
    """
    Document for handling the tensor data of a [`PointCloud3D`][docarray.documents.point_cloud.PointCloud3D] object.

    A PointsAndColors Document can contain:

    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points in 3D space information (`PointsAndColors.points`)
    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points' color information (`PointsAndColors.colors`)
    """

    points: AnyTensor
    colors: Optional[AnyTensor] = None

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[str, AbstractTensor, Any],
    ) -> T:
        if isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = cls(points=value)

        return super().validate(value)

    def display(self) -> None:
        """
        Plot point cloud consisting of points in 3D space and optionally colors.
        """
        if TYPE_CHECKING:
            import trimesh
        else:
            trimesh = import_library('trimesh', raise_error=True)
        from IPython.display import display

        colors = (
            self.colors
            if self.colors is not None
            else np.tile(
                np.array([0, 0, 0]),
                (self.points.get_comp_backend().shape(self.points)[0], 1),
            )
        )
        pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

        s = trimesh.Scene(geometry=pc)
        display(s.show())
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
display()

Plot a point cloud consisting of points in 3D space, optionally with colors.

Source code in docarray/documents/point_cloud/points_and_colors.py
def display(self) -> None:
    """
    Plot point cloud consisting of points in 3D space and optionally colors.
    """
    if TYPE_CHECKING:
        import trimesh
    else:
        trimesh = import_library('trimesh', raise_error=True)
    from IPython.display import display

    colors = (
        self.colors
        if self.colors is not None
        else np.tile(
            np.array([0, 0, 0]),
            (self.points.get_comp_backend().shape(self.points)[0], 1),
        )
    )
    pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

    s = trimesh.Scene(geometry=pc)
    display(s.show())
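
display() needs the optional trimesh dependency and an IPython display context (e.g. a notebook). A minimal sketch constructing the document directly, with illustrative random data:

import numpy as np

from docarray.documents.point_cloud.points_and_colors import PointsAndColors

points = np.random.random((500, 3))  # 500 points in 3D space
colors = np.random.randint(0, 255, (500, 3))  # one RGB color per point

pac = PointsAndColors(points=points, colors=colors)
# pac.display()  # requires `trimesh` and a notebook/IPython environment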
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string

Parameters:

Name Type Description Default
data str

a base64 encoded string

required
protocol Literal['pickle', 'protobuf']

protocol to use. It can be 'pickle' or 'protobuf'

'pickle'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes

Parameters:

Name Type Description Default
data bytes

binary bytes

required
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build a Document object from JSON data

Returns:

Type Description
T

a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message

Parameters:

Name Type Description Default
pb_msg DocProto

the proto message of the Document

required

Returns:

Type Description
T

a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialize with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

Name Type Description Default
b StrBytes
required
content_type str
None
encoding str

the encoding to use when parsing a string, defaults to 'utf8'

'utf8'
proto Protocol

protocol to use.

None
allow_pickle bool

allow pickle protocol

False

Returns:

Type Description
T

a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compress method to use

None

Returns:

Type Description
str

a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

Name Type Description Default
protocol ProtocolType

protocol to use. It can be 'pickle' or 'protobuf'

'protobuf'
compress Optional[str]

compression algorithm to use

None

Returns:

Type Description
bytes

the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

Type Description
DocProto

the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied in place. Updating one Document with another consists of the following:

  • Copying the data properties of the second Document onto the first, when they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating the Dictionaries of the left with those of the right

It behaves like a dictionary update, except that, because it operates on a static schema type, a field counts as present when its value is not None, and DocLists, lists, and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable: they behave like regular values, and the value of self is simply replaced by the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

Name Type Description Default
other T

The Document with which to update the contents of this one

required
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
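
The nested-DocList branch of update() above delegates to docarray.utils.reduce.reduce. A minimal sketch of the effect, assuming (as reduce does) that right-hand docs whose id is not present on the left are appended; the Thread class is hypothetical:

```python
from docarray import BaseDoc, DocList
from docarray.documents import TextDoc


class Thread(BaseDoc):  # hypothetical document with a nested DocList field
    messages: DocList[TextDoc]


t1 = Thread(messages=DocList[TextDoc]([TextDoc(text='hi')]))
t2 = Thread(messages=DocList[TextDoc]([TextDoc(text='there')]))

t1.update(t2)
# docs from t2.messages with unseen ids are appended to t1.messages
assert [m.text for m in t1.messages] == ['hi', 'there']
```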

point_cloud_3d

PointCloud3D

Bases: BaseDoc

Document for handling point clouds for 3D data representation.

A point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly sampling points within the surface of the 3D body. Compared to the mesh representation, the point cloud is a fixed-size ndarray of shape (n_samples, 3) and hence easier for deep learning algorithms to handle.

A PointCloud3D Document can contain:

  • a PointCloud3DUrl (PointCloud3D.url)
  • a PointsAndColors object (PointCloud3D.tensors)
  • an AnyEmbedding (PointCloud3D.embedding)
  • a bytes object (PointCloud3D.bytes_)

You can use this Document directly:

from docarray.documents import PointCloud3D

# use it directly
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)

You can extend this Document:

from docarray.documents import PointCloud3D
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyPointCloud3D(PointCloud3D):
    second_embedding: Optional[AnyEmbedding] = None


pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
pc.tensors = pc.url.load(samples=100)
# model = MyEmbeddingModel()
# pc.embedding = model(pc.tensors.points)
# pc.second_embedding = model(pc.tensors.colors)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import PointCloud3D, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    point_cloud: PointCloud3D
    text: TextDoc


mmdoc = MultiModalDoc(
    point_cloud=PointCloud3D(
        url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

# or
mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()

You can display your point cloud from either its url or its tensors:

from docarray.documents import PointCloud3D

# display from url
pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
# pc.url.display()

# display from tensors
pc.tensors = pc.url.load(samples=10000)
# pc.tensors.display()
Source code in docarray/documents/point_cloud/point_cloud_3d.py
class PointCloud3D(BaseDoc):
    """
    Document for handling point clouds for 3D data representation.

    Point cloud is a representation of a 3D mesh. It is made by repeatedly and uniformly
    sampling points within the surface of the 3D body. Compared to the mesh
    representation, the point cloud is a fixed size ndarray of shape `(n_samples, 3)` and
    hence easier for deep learning algorithms to handle.

    A PointCloud3D Document can contain:

    - a [`PointCloud3DUrl`][docarray.typing.url.PointCloud3DUrl] (`PointCloud3D.url`)
    - a [`PointsAndColors`][docarray.documents.point_cloud.points_and_colors.PointsAndColors] object (`PointCloud3D.tensors`)
    - an [`AnyEmbedding`](../../../../api_references/typing/tensor/embedding) (`PointCloud3D.embedding`)
    - a `bytes` object (`PointCloud3D.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import PointCloud3D

    # use it directly
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    ```

    You can extend this Document:

    ```python
    from docarray.documents import PointCloud3D
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyPointCloud3D(PointCloud3D):
        second_embedding: Optional[AnyEmbedding] = None


    pc = MyPointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    pc.tensors = pc.url.load(samples=100)
    # model = MyEmbeddingModel()
    # pc.embedding = model(pc.tensors.points)
    # pc.second_embedding = model(pc.tensors.colors)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import PointCloud3D, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        point_cloud: PointCloud3D
        text: TextDoc


    mmdoc = MultiModalDoc(
        point_cloud=PointCloud3D(
            url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.point_cloud.tensors = mmdoc.point_cloud.url.load(samples=100)

    # or
    mmdoc.point_cloud.bytes_ = mmdoc.point_cloud.url.load_bytes()
    ```

    You can display your point cloud from either its url, or its tensors:

    ```python
    from docarray.documents import PointCloud3D

    # display from url
    pc = PointCloud3D(url='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
    # pc.url.display()

    # display from tensors
    pc.tensors = pc.url.load(samples=10000)
    # pc.tensors.display()
    ```
    """

    url: Optional[PointCloud3DUrl] = Field(
        description='URL to a file containing point cloud information. Can be remote (web) URL, or a local file path.',
        example='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj',
        default=None,
    )
    tensors: Optional[PointsAndColors] = Field(
        description='A tensor object of 3D point cloud of type `PointsAndColors`.',
        example=[[0, 0, 1], [1, 0, 1], [0, 1, 1]],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of 3D point cloud.',
        example=[1, 1, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of 3D point cloud.',
        default=None,
    )

    @classmethod
    def _validate(self, value: Union[str, AbstractTensor, Any]) -> Any:
        if isinstance(value, str):
            value = {'url': value}
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = {'tensors': PointsAndColors(points=value)}

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
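
Because of these validators, a raw URL string or a raw tensor can be supplied wherever a PointCloud3D is expected as a field value. A minimal sketch, assuming the before-validator fires on nested-field input; the Scene class is hypothetical:

```python
import numpy as np

from docarray import BaseDoc
from docarray.documents import PointCloud3D


class Scene(BaseDoc):  # hypothetical wrapper document
    pc: PointCloud3D


# a plain string is coerced to {'url': ...}
s1 = Scene(pc='https://people.sc.fsu.edu/~jburkardt/data/obj/al.obj')
assert s1.pc.url is not None

# a raw tensor is coerced to PointsAndColors(points=...)
s2 = Scene(pc=np.zeros((100, 3)))
assert s2.pc.tensors.points.shape == (100, 3)
```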
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
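
A minimal sketch of the DocList branch above; the Library class is hypothetical. A nested DocList field ends up as a plain list of dicts in the output:

```python
from docarray import BaseDoc, DocList
from docarray.documents import TextDoc


class Library(BaseDoc):  # hypothetical document with a DocList field
    docs: DocList[TextDoc]


lib = Library(docs=DocList[TextDoc]([TextDoc(text='a'), TextDoc(text='b')]))

d = lib.dict()
assert isinstance(d['docs'], list)
assert d['docs'][0]['text'] == 'a'
```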
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string

Parameters:

  • data (str): a base64 encoded string. Required.
  • protocol (Literal['pickle', 'protobuf']): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'pickle'.
  • compress (Optional[str]): compress method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
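
Note that from_base64 defaults to protocol='pickle' while to_base64 (documented below) defaults to 'protobuf', so it is safest to pass the protocol explicitly on both sides. A minimal round trip, using TextDoc as a stand-in for any Document type:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

b64 = doc.to_base64(protocol='protobuf')
doc2 = TextDoc.from_base64(b64, protocol='protobuf')
assert doc2.text == doc.text
```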
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

  • data (bytes): binary bytes. Required.
  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compress method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
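
The same round trip at the bytes level; here both directions default to 'protobuf', and if a compress method is used it must match on both sides. A minimal sketch:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

data = doc.to_bytes()  # protocol='protobuf' by default
doc2 = TextDoc.from_bytes(data)
assert doc2.text == doc.text
```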
from_json(data) classmethod

Build Document object from json data

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
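
A minimal JSON round trip, combining from_json with the json() method documented below:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

doc2 = TextDoc.from_json(doc.json())
assert doc2.text == doc.text
```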
from_protobuf(pb_msg) classmethod

create a Document from a protobuf message

Parameters:

  • pb_msg (DocProto): the proto message of the Document. Required.

Returns:

  • T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
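
Combined with to_protobuf (documented below), this gives a protobuf round trip, e.g. for a DocProto message received over the wire. A minimal sketch:

```python
from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

pb_msg = doc.to_protobuf()  # a DocProto message
doc2 = TextDoc.from_protobuf(pb_msg)
assert doc2.text == doc.text
```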
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
        )  # here we need to keep doclist as doclist, otherwise if a user wants a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

  • b (StrBytes): the raw string or bytes to parse. Required.
  • content_type (str). Default: None.
  • encoding (str): the encoding to use when parsing a string. Default: 'utf8'.
  • proto (Protocol): protocol to use. Default: None.
  • allow_pickle (bool): allow pickle protocol. Default: False.

Returns:

  • T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
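
Both helpers only print to the console and return None. A minimal usage sketch:

```python
from docarray.documents import TextDoc

TextDoc.schema_summary()  # prints the class schema
TextDoc(text='hello world').summary()  # prints the non-empty fields of this instance
```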
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compress method to use. Default: None.

Returns:

  • str: a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression algorithm to use. Default: None.

Returns:

  • bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

  • DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting the data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating nested Documents and DocLists
  • Updating the dictionaries of the left with those of the right

It behaves like the update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable; they behave like regular simple types, and the value of self is overwritten with the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']
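
The example above covers simple fields and lists. A minimal sketch of the dict and set behavior under the same rules; the Stats class is hypothetical:

```python
from typing import Dict, Set

from docarray import BaseDoc


class Stats(BaseDoc):  # hypothetical document with dict and set fields
    counters: Dict[str, int]
    labels: Set[str]


s1 = Stats(counters={'a': 1, 'b': 2}, labels={'x'})
s2 = Stats(counters={'b': 3}, labels={'y'})

s1.update(s2)
assert s1.counters == {'a': 1, 'b': 3}  # dicts: right-hand values win per key
assert s1.labels == {'x', 'y'}  # sets: union
```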

Parameters:

  • other (T): the Document with which to update the contents of this one. Required.
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

points_and_colors

PointsAndColors

Bases: BaseDoc

Document for handling the tensor data of a PointCloud3D object.

A PointsAndColors Document can contain:

  • an AnyTensor containing the points in 3D space information (PointsAndColors.points)
  • an AnyTensor containing the points' color information (PointsAndColors.colors)
Source code in docarray/documents/point_cloud/points_and_colors.py
class PointsAndColors(BaseDoc):
    """
    Document for handling the tensor data of a [`PointCloud3D`][docarray.documents.point_cloud.PointCloud3D] object.

    A PointsAndColors Document can contain:

    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points in 3D space information (`PointsAndColors.points`)
    - an [`AnyTensor`](../../../../api_references/typing/tensor/tensor)
    containing the points' color information (`PointsAndColors.colors`)
    """

    points: AnyTensor
    colors: Optional[AnyTensor] = None

    @classmethod
    def validate(
        cls: Type[T],
        value: Union[str, AbstractTensor, Any],
    ) -> T:
        if isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = cls(points=value)

        return super().validate(value)

    def display(self) -> None:
        """
        Plot point cloud consisting of points in 3D space and optionally colors.
        """
        if TYPE_CHECKING:
            import trimesh
        else:
            trimesh = import_library('trimesh', raise_error=True)
        from IPython.display import display

        colors = (
            self.colors
            if self.colors is not None
            else np.tile(
                np.array([0, 0, 0]),
                (self.points.get_comp_backend().shape(self.points)[0], 1),
            )
        )
        pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

        s = trimesh.Scene(geometry=pc)
        display(s.show())
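
A minimal usage sketch; display() assumes the optional trimesh dependency is installed and an IPython environment to render in:

```python
import numpy as np

from docarray.documents.point_cloud.points_and_colors import PointsAndColors

pac = PointsAndColors(points=np.random.random((100, 3)))
# pac.display()  # requires `trimesh` and IPython
```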
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
display()

Plot point cloud consisting of points in 3D space and optionally colors.

Source code in docarray/documents/point_cloud/points_and_colors.py
def display(self) -> None:
    """
    Plot point cloud consisting of points in 3D space and optionally colors.
    """
    if TYPE_CHECKING:
        import trimesh
    else:
        trimesh = import_library('trimesh', raise_error=True)
    from IPython.display import display

    colors = (
        self.colors
        if self.colors is not None
        else np.tile(
            np.array([0, 0, 0]),
            (self.points.get_comp_backend().shape(self.points)[0], 1),
        )
    )
    pc = trimesh.points.PointCloud(vertices=self.points, colors=colors)

    s = trimesh.Scene(geometry=pc)
    display(s.show())
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string

Parameters:

  • data (str): a base64 encoded string. Required.
  • protocol (Literal['pickle', 'protobuf']): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'pickle'.
  • compress (Optional[str]): compress method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build Document object from binary bytes

Parameters:

  • data (bytes): binary bytes. Required.
  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compress method to use. Default: None.

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build Document object from json data

Returns:

  • T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

create a Document from a protobuf message

Parameters:

  • pb_msg (DocProto): the proto message of the Document. Required.

Returns:

  • T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
        )  # here we need to keep doclist as doclist, otherwise if a user wants a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

  • b (StrBytes): the raw string or bytes to parse. Required.
  • content_type (str). Default: None.
  • encoding (str): the encoding to use when parsing a string. Default: 'utf8'.
  • proto (Protocol): protocol to use. Default: None.
  • allow_pickle (bool): allow pickle protocol. Default: False.

Returns:

  • T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Documents schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compress method to use. Default: None.

Returns:

  • str: a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

  • protocol (ProtocolType): protocol to use. It can be 'pickle' or 'protobuf'. Default: 'protobuf'.
  • compress (Optional[str]): compression algorithm to use. Default: None.

Returns:

  • bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

  • DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting the data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating nested Documents and DocLists
  • Updating the dictionaries of the left with those of the right

It behaves like the update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated rather than replaced. Tuples are not merged, since they are meant to be immutable; they behave like regular simple types, and the value of self is overwritten with the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

  • other (T): the Document with which to update the contents of this one. Required.
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocLists
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocLists,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)

text

TextDoc

Bases: BaseDoc

Document for handling text.

It can contain:

  • a TextUrl (TextDoc.url)
  • a str (TextDoc.text)
  • an AnyEmbedding (TextDoc.embedding)
  • a bytes object (TextDoc.bytes_)

You can use this Document directly:

from docarray.documents import TextDoc

# use it directly
txt_doc = TextDoc(url='https://www.gutenberg.org/files/1065/1065-0.txt')
txt_doc.text = txt_doc.url.load()
# model = MyEmbeddingModel()
# txt_doc.embedding = model(txt_doc.text)

You can initialize directly from a string:

from docarray.documents import TextDoc

txt_doc = TextDoc('hello world')

You can extend this Document:

from docarray.documents import TextDoc
from docarray.typing import AnyEmbedding
from typing import Optional


# extend it
class MyText(TextDoc):
    second_embedding: Optional[AnyEmbedding] = None


txt_doc = MyText(url='https://www.gutenberg.org/files/1065/1065-0.txt')
txt_doc.text = txt_doc.url.load()
# model = MyEmbeddingModel()
# txt_doc.embedding = model(txt_doc.text)
# txt_doc.second_embedding = model(txt_doc.text)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import ImageDoc, TextDoc


# compose it
class MultiModalDoc(BaseDoc):
    image_doc: ImageDoc
    text_doc: TextDoc


mmdoc = MultiModalDoc(
    image_doc=ImageDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
    ),
    text_doc=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.image_doc.tensor = mmdoc.image_doc.url.load()

# or
mmdoc.image_doc.bytes_ = mmdoc.image_doc.url.load_bytes()
mmdoc.image_doc.tensor = mmdoc.image_doc.bytes_.load()

This Document can be compared against another Document of the same type or against a string. When compared against another object of the same type, the pydantic BaseModel equality check applies, which compares every attribute except id. When compared against a str, it checks the equality of the text attribute against the given string.

from docarray.documents import TextDoc

doc = TextDoc(text='This is the main text', url='exampleurl.com/file')
doc2 = TextDoc(text='This is the main text', url='exampleurl.com/file')

doc == 'This is the main text'  # True
doc == doc2  # True
Source code in docarray/documents/text.py
class TextDoc(BaseDoc):
    """
    Document for handling text.

    It can contain:

    - a [`TextUrl`][docarray.typing.url.TextUrl] (`TextDoc.url`)
    - a `str` (`TextDoc.text`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`TextDoc.embedding`)
    - a `bytes` object (`TextDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import TextDoc

    # use it directly
    txt_doc = TextDoc(url='https://www.gutenberg.org/files/1065/1065-0.txt')
    txt_doc.text = txt_doc.url.load()
    # model = MyEmbeddingModel()
    # txt_doc.embedding = model(txt_doc.text)
    ```

    You can initialize directly from a string:

    ```python
    from docarray.documents import TextDoc

    txt_doc = TextDoc('hello world')
    ```

    You can extend this Document:

    ```python
    from docarray.documents import TextDoc
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyText(TextDoc):
        second_embedding: Optional[AnyEmbedding] = None


    txt_doc = MyText(url='https://www.gutenberg.org/files/1065/1065-0.txt')
    txt_doc.text = txt_doc.url.load()
    # model = MyEmbeddingModel()
    # txt_doc.embedding = model(txt_doc.text)
    # txt_doc.second_embedding = model(txt_doc.text)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import ImageDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        image_doc: ImageDoc
        text_doc: TextDoc


    mmdoc = MultiModalDoc(
        image_doc=ImageDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
        ),
        text_doc=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.image_doc.tensor = mmdoc.image_doc.url.load()

    # or
    mmdoc.image_doc.bytes_ = mmdoc.image_doc.url.load_bytes()
    mmdoc.image_doc.tensor = mmdoc.image_doc.bytes_.load()
    ```

    This Document can be compared against another Document of the same type or a string.
    When compared against another object of the same type, the pydantic BaseModel
    equality check will apply which checks the equality of every attribute,
    excluding `id`. When compared against a str, it will check the equality
    of the `text` attribute against the given string.

    ```python
    from docarray.documents import TextDoc

    doc = TextDoc(text='This is the main text', url='exampleurl.com/file')
    doc2 = TextDoc(text='This is the main text', url='exampleurl.com/file')

    doc == 'This is the main text'  # True
    doc == doc2  # True
    ```

    """

    text: Optional[str] = Field(
        description='The text content stored in the document',
        example='This is an example text content of the document',
        default=None,
    )
    url: Optional[TextUrl] = Field(
        description='URL to a (potentially remote) text file that can be loaded',
        example='https://www.w3.org/History/19921103-hypertext/hypertext/README.html',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the text',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of the text',
        default=None,
    )

    def __init__(self, text: Optional[str] = None, **kwargs):
        if 'text' not in kwargs:
            kwargs['text'] = text
        super().__init__(**kwargs)

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, values):
            if isinstance(values, str):
                return {'text': values}
            else:
                return values

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, Any],
        ) -> T:
            if isinstance(value, str):
                value = cls(text=value)
            return super().validate(value)

    def __eq__(self, other: Any) -> bool:
        if isinstance(other, str):
            return self.text == other
        else:
            # BaseModel has a default equality
            return super().__eq__(other)

    def __contains__(self, item: str) -> bool:
        """
        This method makes `TextDoc` behave like a `str`.

        :param item: A string to be checked if it is a substring of the `text` attribute
        :return: A boolean determining the presence of `item` as a substring in `text`

        ```python
        from docarray.documents import TextDoc

        t = TextDoc(text='this is my text document')
        assert 'text' in t
        assert 'docarray' not in t
        ```
        """
        if self.text is not None:
            return self.text.__contains__(item)
        else:
            return False

    def _get_string_for_regex_filter(self):
        return self.text
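
Because of the custom `__init__` and the string validators shown above, a plain `str` is accepted wherever a `TextDoc` is expected. A minimal sketch (the `Container` class below is hypothetical, for illustration only):

from docarray import BaseDoc
from docarray.documents import TextDoc


class Container(BaseDoc):
    text_doc: TextDoc


# a positional string goes through the custom __init__
doc = TextDoc('hello world')
assert doc.text == 'hello world'

# a bare string assigned to a TextDoc field is coerced during validation
container = Container(text_doc='hello world')
assert container.text_doc.text == 'hello world'
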
__contains__(item)

This method makes TextDoc behave like a str.

Parameters:

- item (str, required): A string to be checked if it is a substring of the `text` attribute

Returns:

- bool: A boolean determining the presence of `item` as a substring in `text`

from docarray.documents import TextDoc

t = TextDoc(text='this is my text document')
assert 'text' in t
assert 'docarray' not in t

Source code in docarray/documents/text.py
def __contains__(self, item: str) -> bool:
    """
    This method makes `TextDoc` behave like a `str`.

    :param item: A string to be checked if it is a substring of the `text` attribute
    :return: A boolean determining the presence of `item` as a substring in `text`

    ```python
    from docarray.documents import TextDoc

    t = TextDoc(text='this is my text document')
    assert 'text' in t
    assert 'docarray' not in t
    ```
    """
    if self.text is not None:
        return self.text.__contains__(item)
    else:
        return False
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
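
A short usage sketch (pydantic v1 semantics, matching the listing above; the field values are illustrative):

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

# exclude_none drops fields that are None (url, embedding, bytes_ here)
d = doc.dict(exclude_none=True)
assert d['text'] == 'hello world'
assert 'url' not in d
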
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string.

Parameters:

- data (str, required): a base64 encoded string
- protocol (Literal['pickle', 'protobuf'], default 'pickle'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression method to use

Returns:

- T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
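
Note that `to_base64` defaults to `protocol='protobuf'` while `from_base64` defaults to `protocol='pickle'`, so it is safest to pass the protocol explicitly when round-tripping. A minimal sketch:

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

# serialize and deserialize with a matching protocol
b64 = doc.to_base64(protocol='protobuf')
doc2 = TextDoc.from_base64(b64, protocol='protobuf')
assert doc2.text == 'hello world'
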
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes.

Parameters:

- data (bytes, required): binary bytes
- protocol (ProtocolType, default 'protobuf'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression method to use

Returns:

- T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
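
A round-trip sketch; passing `compress='zlib'` assumes zlib is among the supported compression algorithms (the available set depends on the docarray version and installed extras):

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

# the same protocol and compression must be used on both sides
data = doc.to_bytes(protocol='protobuf', compress='zlib')
doc2 = TextDoc.from_bytes(data, protocol='protobuf', compress='zlib')
assert doc2.text == 'hello world'
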
from_json(data) classmethod

Build a Document object from JSON data.

Returns:

- T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
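
A round-trip sketch using the `json()` serializer documented below:

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

# json() produces a string that from_json() can parse back
doc2 = TextDoc.from_json(doc.json())
assert doc2.text == doc.text
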
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

- pb_msg (DocProto, required): the proto message of the Document

Returns:

- T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
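
A round-trip sketch through the protobuf representation (requires the protobuf dependency to be installed):

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

pb_msg = doc.to_protobuf()  # DocProto message
doc2 = TextDoc.from_protobuf(pb_msg)
assert doc2.text == 'hello world'
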
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
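
A usage sketch (pydantic v1 semantics, matching the listing above); `exclude` takes the same shape as in `dict()`:

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

# exclude specific fields from the JSON output
json_str = doc.json(exclude={'embedding', 'bytes_'})
assert 'embedding' not in json_str
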
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

- b (StrBytes, required)
- content_type (str, default None)
- encoding (str, default 'utf8'): the encoding to use when parsing a string
- proto (Protocol, default None): protocol to use
- allow_pickle (bool, default False): allow pickle protocol

Returns:

- T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
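
A minimal sketch (pydantic v1 semantics, where `parse_raw` defaults to JSON parsing):

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')
doc2 = TextDoc.parse_raw(doc.json())
assert doc2.text == 'hello world'
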
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
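
Both helpers print to the console; a quick sketch:

from docarray.documents import TextDoc

TextDoc.schema_summary()  # class-level: prints the fields and their types
TextDoc(text='hello world').summary()  # instance-level: prints non-empty fields
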
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string.

Parameters:

- protocol (ProtocolType, default 'protobuf'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression method to use

Returns:

- str: a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

- protocol (ProtocolType, default 'protobuf'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression algorithm to use

Returns:

- bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
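
As the docstring suggests, `bytes(doc)` is the more Pythonic spelling of `to_bytes()` with default arguments; a minimal sketch:

from docarray.documents import TextDoc

doc = TextDoc(text='hello world')

raw = bytes(doc)  # delegates to to_bytes() with the default protobuf protocol
doc2 = TextDoc.from_bytes(raw)
assert doc2.text == 'hello world'
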
to_protobuf()

Convert Document into a Protobuf message.

Returns:

- DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating the dictionaries of the left with those of the right

It behaves like the update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Note that tuples are not merged, since they are meant to be immutable: they behave like regular values, and the value of self is updated with the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

- other (T, required): the Document with which to update the contents of this one
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document on the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Recursively updating Documents and DocLists
     - Updating the dictionaries of the left with those of the right

    It behaves like the update operation on dictionaries, except that, since
    it is applied to a static schema type, a field counts as present when its
    value is not None, and DocLists, lists and sets are concatenated. Note
    that tuples are not merged, since they are meant to be immutable: they
    behave like regular values, and the value of `self` is updated with the
    value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)
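
The same mechanics apply to dict fields, which are merged with the right-hand values winning; a small sketch (the `TaggedDoc` schema is hypothetical, for illustration only):

from typing import Dict

from docarray import BaseDoc


class TaggedDoc(BaseDoc):
    meta: Dict


a = TaggedDoc(meta={'lang': 'en', 'pages': 1})
b = TaggedDoc(meta={'pages': 2})

a.update(b)  # dict fields are merged like dict.update()
assert a.meta == {'lang': 'en', 'pages': 2}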

video

VideoDoc

Bases: BaseDoc

Document for handling video.

The Video Document can contain:

- a VideoUrl (VideoDoc.url)
- an AudioDoc (VideoDoc.audio)
- a VideoTensor (VideoDoc.tensor)
- an AnyTensor representing the indices of the video's key frames (VideoDoc.key_frame_indices)
- an AnyEmbedding (VideoDoc.embedding)
- a VideoBytes object (VideoDoc.bytes_)

You can use this Document directly:

from docarray.documents import VideoDoc, AudioDoc

# use it directly
vid = VideoDoc(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
)
tensor, audio_tensor, key_frame_indices = vid.url.load()
vid.tensor = tensor
vid.audio = AudioDoc(tensor=audio_tensor)
vid.key_frame_indices = key_frame_indices
# model = MyEmbeddingModel()
# vid.embedding = model(vid.tensor)

You can extend this Document:

from typing import Optional

from docarray.documents import TextDoc, VideoDoc


# extend it
class MyVideo(VideoDoc):
    name: Optional[TextDoc] = None


video = MyVideo(
    url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
)
video.name = TextDoc(text='my first video')
video.tensor = video.url.load().video
# model = MyEmbeddingModel()
# video.embedding = model(video.tensor)

You can use this Document for composition:

from docarray import BaseDoc
from docarray.documents import TextDoc, VideoDoc


# compose it
class MultiModalDoc(BaseDoc):
    video: VideoDoc
    text: TextDoc


mmdoc = MultiModalDoc(
    video=VideoDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
    ),
    text=TextDoc(text='hello world, how are you doing?'),
)
mmdoc.video.tensor = mmdoc.video.url.load().video

# or
mmdoc.video.bytes_ = mmdoc.video.url.load_bytes()
mmdoc.video.tensor = mmdoc.video.bytes_.load().video
Source code in docarray/documents/video.py
class VideoDoc(BaseDoc):
    """
    Document for handling video.

    The Video Document can contain:

    - a [`VideoUrl`][docarray.typing.url.VideoUrl] (`VideoDoc.url`)
    - an [`AudioDoc`][docarray.documents.AudioDoc] (`VideoDoc.audio`)
    - a [`VideoTensor`](../../../api_references/typing/tensor/video) (`VideoDoc.tensor`)
    - an [`AnyTensor`](../../../api_references/typing/tensor/tensor) representing the indices of the video's key frames (`VideoDoc.key_frame_indices`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`VideoDoc.embedding`)
    - a [`VideoBytes`][docarray.typing.bytes.VideoBytes] object (`VideoDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import VideoDoc, AudioDoc

    # use it directly
    vid = VideoDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
    )
    tensor, audio_tensor, key_frame_indices = vid.url.load()
    vid.tensor = tensor
    vid.audio = AudioDoc(tensor=audio_tensor)
    vid.key_frame_indices = key_frame_indices
    # model = MyEmbeddingModel()
    # vid.embedding = model(vid.tensor)
    ```

    You can extend this Document:

    ```python
    from typing import Optional

    from docarray.documents import TextDoc, VideoDoc


    # extend it
    class MyVideo(VideoDoc):
        name: Optional[TextDoc] = None


    video = MyVideo(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
    )
    video.name = TextDoc(text='my first video')
    video.tensor = video.url.load().video
    # model = MyEmbeddingModel()
    # video.embedding = model(video.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import TextDoc, VideoDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        video: VideoDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        video=VideoDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.video.tensor = mmdoc.video.url.load().video

    # or
    mmdoc.video.bytes_ = mmdoc.video.url.load_bytes()
    mmdoc.video.tensor = mmdoc.video.bytes_.load().video
    ```
    """

    url: Optional[VideoUrl] = Field(
        description='URL to a (potentially remote) video file that needs to be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true',
        default=None,
    )
    audio: Optional[AudioDoc] = Field(
        description='Audio document associated with the video',
        default=None,
    )
    tensor: Optional[VideoTensor] = Field(
        description='Tensor object representing the video, which can be one of `VideoNdArray`, `VideoTorchTensor` or `VideoTensorFlowTensor`',
        default=None,
    )
    key_frame_indices: Optional[AnyTensor] = Field(
        description='Indices of the key frames in the video',
        example=[0, 1, 2, 3, 4],
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the video',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[VideoBytes] = Field(
        description='Bytes representation of the video',
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        if isinstance(value, str):
            value = dict(url=value)
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            torch is not None
            and isinstance(value, torch.Tensor)
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)

        return value

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
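
The validators above also let a plain URL string (or a tensor) be coerced into a VideoDoc wherever one is expected; a minimal sketch (the `Wrapper` class is hypothetical, for illustration only):

from docarray import BaseDoc
from docarray.documents import VideoDoc


class Wrapper(BaseDoc):
    video: VideoDoc


# the bare string is converted to dict(url=...) during validation
w = Wrapper(
    video='https://github.com/docarray/docarray/blob/main/tests/toydata/mov_bbb.mp4?raw=true'
)
assert w.video.url is not None
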
dict(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False)

Generate a dictionary representation of the model, optionally specifying which fields to include or exclude.

Source code in docarray/base_doc/doc.py
def dict(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
) -> 'DictStrAny':
    """
    Generate a dictionary representation of the model, optionally specifying
    which fields to include or exclude.

    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_doclist(
        exclude=exclude
    )

    data = super().dict(
        include=include,
        exclude=exclude,
        by_alias=by_alias,
        skip_defaults=skip_defaults,
        exclude_unset=exclude_unset,
        exclude_defaults=exclude_defaults,
        exclude_none=exclude_none,
    )

    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            val = getattr(self, field)
            data[field] = (
                [doc.dict() for doc in val] if val is not None else None
            )

    return data
from_base64(data, protocol='pickle', compress=None) classmethod

Build a Document object from a base64-encoded string.

Parameters:

- data (str, required): a base64 encoded string
- protocol (Literal['pickle', 'protobuf'], default 'pickle'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression method to use

Returns:

- T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_base64(
    cls: Type[T],
    data: str,
    protocol: Literal['pickle', 'protobuf'] = 'pickle',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: a base64 encoded string
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    return cls.from_bytes(base64.b64decode(data), protocol, compress)
from_bytes(data, protocol='protobuf', compress=None) classmethod

Build a Document object from binary bytes.

Parameters:

- data (bytes, required): binary bytes
- protocol (ProtocolType, default 'protobuf'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression method to use

Returns:

- T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_bytes(
    cls: Type[T],
    data: bytes,
    protocol: ProtocolType = 'protobuf',
    compress: Optional[str] = None,
) -> T:
    """Build Document object from binary bytes

    :param data: binary bytes
    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a Document object
    """
    bstr = _decompress_bytes(data, algorithm=compress)
    if protocol == 'pickle':
        return pickle.loads(bstr)
    elif protocol == 'protobuf':
        from docarray.proto import DocProto

        pb_msg = DocProto()
        pb_msg.ParseFromString(bstr)
        return cls.from_protobuf(pb_msg)
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
from_json(data) classmethod

Build a Document object from JSON data.

Returns:

- T: a Document object

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_json(
    cls: Type[T],
    data: str,
) -> T:
    """Build Document object from json data
    :return: a Document object
    """
    # TODO: add tests

    if is_pydantic_v2:
        return cls.model_validate_json(data)
    else:
        return cls.parse_raw(data)
from_protobuf(pb_msg) classmethod

Create a Document from a protobuf message.

Parameters:

- pb_msg (DocProto, required): the proto message of the Document

Returns:

- T: a Document initialized with the proto data

Source code in docarray/base_doc/mixins/io.py
@classmethod
def from_protobuf(cls: Type[T], pb_msg: 'DocProto') -> T:
    """create a Document from a protobuf message

    :param pb_msg: the proto message of the Document
    :return: a Document initialized with the proto data
    """

    fields: Dict[str, Any] = {}
    load_extra_field = (
        cls.model_config['_load_extra_fields_from_protobuf']
        if is_pydantic_v2
        else cls.Config._load_extra_fields_from_protobuf
    )
    for field_name in pb_msg.data:
        if (
            not (load_extra_field)
            and field_name not in cls._docarray_fields().keys()
        ):
            continue  # optimization we don't even load the data if the key does not
            # match any field in the cls or in the mapping

        fields[field_name] = cls._get_content_from_node_proto(
            pb_msg.data[field_name], field_name
        )

    return cls(**fields)
json(*, include=None, exclude=None, by_alias=False, skip_defaults=None, exclude_unset=False, exclude_defaults=False, exclude_none=False, encoder=None, models_as_dict=True, **dumps_kwargs)

Generate a JSON representation of the model, include and exclude arguments as per dict().

encoder is an optional function to supply as default to json.dumps(), other arguments as per json.dumps().

Source code in docarray/base_doc/doc.py
def json(
    self,
    *,
    include: Optional[Union['AbstractSetIntStr', 'MappingIntStrAny']] = None,
    exclude: ExcludeType = None,
    by_alias: bool = False,
    skip_defaults: Optional[bool] = None,
    exclude_unset: bool = False,
    exclude_defaults: bool = False,
    exclude_none: bool = False,
    encoder: Optional[Callable[[Any], Any]] = None,
    models_as_dict: bool = True,
    **dumps_kwargs: Any,
) -> str:
    """
    Generate a JSON representation of the model, `include` and `exclude`
    arguments as per `dict()`.

    `encoder` is an optional function to supply as `default` to json.dumps(),
    other arguments as per `json.dumps()`.
    """
    exclude, original_exclude, doclist_exclude_fields = self._exclude_docarray(
        exclude=exclude
    )

    # this is copy from pydantic code
    if skip_defaults is not None:
        warnings.warn(
            f'{self.__class__.__name__}.json(): "skip_defaults" is deprecated and replaced by "exclude_unset"',
            DeprecationWarning,
        )
        exclude_unset = skip_defaults
    encoder = cast(Callable[[Any], Any], encoder or self.__json_encoder__)

    # We don't directly call `self.dict()`, which does exactly this with `to_dict=True`
    # because we want to be able to keep raw `BaseModel` instances and not as `dict`.
    # This allows users to write custom JSON encoders for given `BaseModel` classes.
    data = dict(
        self._iter(
            to_dict=models_as_dict,
            by_alias=by_alias,
            include=include,
            exclude=exclude,
            exclude_unset=exclude_unset,
            exclude_defaults=exclude_defaults,
            exclude_none=exclude_none,
        )
    )

    # this is the custom part to deal with DocList
    for field in doclist_exclude_fields:
        # we need to do this because pydantic will not recognize DocList correctly
        original_exclude = original_exclude or {}
        if field not in original_exclude:
            data[field] = getattr(
                self, field
            )  # here we need to keep doclist as doclist otherwise if a user want to have a special json config it will not work

    # this is copy from pydantic code
    if self.__custom_root_type__:
        data = data[ROOT_KEY]
    return self.__config__.json_dumps(data, default=encoder, **dumps_kwargs)
parse_raw(b, *, content_type=None, encoding='utf8', proto=None, allow_pickle=False) classmethod

Parse a raw string or bytes into a base doc

Parameters:

- b (StrBytes, required)
- content_type (str, default None)
- encoding (str, default 'utf8'): the encoding to use when parsing a string
- proto (Protocol, default None): protocol to use
- allow_pickle (bool, default False): allow pickle protocol

Returns:

- T: a document

Source code in docarray/base_doc/doc.py
@no_type_check
@classmethod
def parse_raw(
    cls: Type[T],
    b: 'StrBytes',
    *,
    content_type: str = None,
    encoding: str = 'utf8',
    proto: 'Protocol' = None,
    allow_pickle: bool = False,
) -> T:
    """
    Parse a raw string or bytes into a base doc
    :param b:
    :param content_type:
    :param encoding: the encoding to use when parsing a string, defaults to 'utf8'
    :param proto: protocol to use.
    :param allow_pickle: allow pickle protocol
    :return: a document
    """
    return super(BaseDocWithoutId, cls).parse_raw(
        b,
        content_type=content_type,
        encoding=encoding,
        proto=proto,
        allow_pickle=allow_pickle,
    )
schema_summary() classmethod

Print a summary of the Document's schema.

Source code in docarray/base_doc/doc.py
@classmethod
def schema_summary(cls) -> None:
    """Print a summary of the Documents schema."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary.schema_summary(cls)
summary()

Print non-empty fields and nested structure of this Document object.

Source code in docarray/base_doc/doc.py
def summary(self) -> None:
    """Print non-empty fields and nested structure of this Document object."""
    from docarray.display.document_summary import DocumentSummary

    DocumentSummary(doc=self).summary()
to_base64(protocol='protobuf', compress=None)

Serialize a Document object into a base64 string.

Parameters:

- protocol (ProtocolType, default 'protobuf'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression method to use

Returns:

- str: a base64 encoded string

Source code in docarray/base_doc/mixins/io.py
def to_base64(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> str:
    """Serialize a Document object into as base64 string

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compress method to use
    :return: a base64 encoded string
    """
    return base64.b64encode(self.to_bytes(protocol, compress)).decode('utf-8')
to_bytes(protocol='protobuf', compress=None)

Serialize itself into bytes.

For more Pythonic code, please use bytes(...).

Parameters:

- protocol (ProtocolType, default 'protobuf'): protocol to use; it can be 'pickle' or 'protobuf'
- compress (Optional[str], default None): compression algorithm to use

Returns:

- bytes: the binary serialization in bytes

Source code in docarray/base_doc/mixins/io.py
def to_bytes(
    self, protocol: ProtocolType = 'protobuf', compress: Optional[str] = None
) -> bytes:
    """Serialize itself into bytes.

    For more Pythonic code, please use ``bytes(...)``.

    :param protocol: protocol to use. It can be 'pickle' or 'protobuf'
    :param compress: compression algorithm to use
    :return: the binary serialization in bytes
    """
    import pickle

    if protocol == 'pickle':
        bstr = pickle.dumps(self)
    elif protocol == 'protobuf':
        bstr = self.to_protobuf().SerializePartialToString()
    else:
        raise ValueError(
            f'protocol={protocol} is not supported. Can be only `protobuf` or '
            f'pickle protocols 0-5.'
        )
    return _compress_bytes(bstr, algorithm=compress)
to_protobuf()

Convert Document into a Protobuf message.

Returns:

- DocProto: the protobuf message

Source code in docarray/base_doc/mixins/io.py
def to_protobuf(self: T) -> 'DocProto':
    """Convert Document into a Protobuf message.

    :return: the protobuf message
    """
    from docarray.proto import DocProto

    data = {}
    for field, value in self:
        try:
            data[field] = _type_to_protobuf(value)
        except RecursionError as ex:
            if len(ex.args) >= 1:
                ex.args = (
                    (
                        f'Field `{field}` contains cyclic reference in memory. '
                        'Could it be your Document is referring to itself?'
                    ),
                )
            raise ex
        except Exception as ex:
            if len(ex.args) >= 1:
                ex.args = (f'Field `{field}` is problematic',) + ex.args
            raise ex

    return DocProto(data=data)
update(other)

Updates self with the content of other. Changes are applied to self. Updating one Document with another consists of the following:

  • Setting data properties of the second Document on the first Document if they are not None
  • Concatenating lists and updating sets
  • Recursively updating Documents and DocLists
  • Updating the dictionaries of the left with those of the right

It behaves like the update operation on dictionaries, except that, since it is applied to a static schema type, a field counts as present when its value is not None, and DocLists, lists and sets are concatenated. Note that tuples are not merged, since they are meant to be immutable: they behave like regular values, and the value of self is updated with the value of other.


from typing import List, Optional

from docarray import BaseDoc


class MyDocument(BaseDoc):
    content: str
    title: Optional[str] = None
    tags_: List


doc1 = MyDocument(
    content='Core content of the document', title='Title', tags_=['python', 'AI']
)
doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

doc1.update(doc2)
assert doc1.content == 'Core content updated'
assert doc1.title == 'Title'
assert doc1.tags_ == ['python', 'AI', 'docarray']

Parameters:

- other (T, required): the Document with which to update the contents of this one
Source code in docarray/base_doc/mixins/update.py
def update(self, other: T):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists of the following:

     - Setting data properties of the second Document on the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Recursively updating Documents and DocLists
     - Updating the dictionaries of the left with those of the right

    It behaves like the update operation on dictionaries, except that, since
    it is applied to a static schema type, a field counts as present when its
    value is not None, and DocLists, lists and sets are concatenated. Note
    that tuples are not merged, since they are meant to be immutable: they
    behave like regular values, and the value of `self` is updated with the
    value of `other`.


    ---

    ```python
    from typing import List, Optional

    from docarray import BaseDoc


    class MyDocument(BaseDoc):
        content: str
        title: Optional[str] = None
        tags_: List


    doc1 = MyDocument(
        content='Core content of the document', title='Title', tags_=['python', 'AI']
    )
    doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

    doc1.update(doc2)
    assert doc1.content == 'Core content updated'
    assert doc1.title == 'Title'
    assert doc1.tags_ == ['python', 'AI', 'docarray']
    ```

    ---
    :param other: The Document with which to update the contents of this
    """
    if not _similar_schemas(self, other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same schema. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    from collections import namedtuple

    from docarray import DocList
    from docarray.utils.reduce import reduce

    # Declaring namedtuple()
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'UpdateMixin') -> _FieldGroups:
        simple_non_empty_fields: List[str] = []
        list_fields: List[str] = []
        set_fields: List[str] = []
        dict_fields: List[str] = []
        nested_docs_fields: List[str] = []
        nested_docarray_fields: List[str] = []

        for field_name, field in doc._docarray_fields().items():
            if field_name not in FORBIDDEN_FIELDS_TO_UPDATE:
                field_type = doc._get_field_annotation(field_name)

                if isinstance(field_type, type) and safe_issubclass(
                    field_type, DocList
                ):
                    nested_docarray_fields.append(field_name)
                else:
                    origin = get_origin(field_type)
                    if origin is list:
                        list_fields.append(field_name)
                    elif origin is set:
                        set_fields.append(field_name)
                    elif origin is dict:
                        dict_fields.append(field_name)
                    else:
                        v = getattr(doc, field_name)
                        if v is not None:
                            if isinstance(v, UpdateMixin):
                                nested_docs_fields.append(field_name)
                            else:
                                simple_non_empty_fields.append(field_name)
        return _FieldGroups(
            simple_non_empty_fields,
            list_fields,
            set_fields,
            dict_fields,
            nested_docarray_fields,
            nested_docs_fields,
        )

    doc1_fields = _group_fields(self)
    doc2_fields = _group_fields(other)

    for field in doc2_fields.simple_non_empty_fields:
        setattr(self, field, getattr(other, field))

    for field in set(
        doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields
    ):
        sub_doc_1: T = getattr(self, field)
        sub_doc_2: T = getattr(other, field)
        sub_doc_1.update(sub_doc_2)
        setattr(self, field, sub_doc_1)

    for field in set(doc1_fields.list_fields + doc2_fields.list_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.extend(array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.set_fields + doc2_fields.set_fields):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1.update(array2)
            setattr(self, field, array1)

    for field in set(
        doc1_fields.nested_docarray_fields + doc2_fields.nested_docarray_fields
    ):
        array1 = getattr(self, field)
        array2 = getattr(other, field)
        if array1 is None and array2 is not None:
            setattr(self, field, array2)
        elif array1 is not None and array2 is not None:
            array1 = reduce(array1, array2)
            setattr(self, field, array1)

    for field in set(doc1_fields.dict_fields + doc2_fields.dict_fields):
        dict1 = getattr(self, field)
        dict2 = getattr(other, field)
        if dict1 is None and dict2 is not None:
            setattr(self, field, dict2)
        elif dict1 is not None and dict2 is not None:
            dict1.update(dict2)
            setattr(self, field, dict1)