Skip to content



Bases: Sequence[T_doc], Generic[T_doc], AbstractType

Source code in docarray/array/
class AnyDocArray(Sequence[T_doc], Generic[T_doc], AbstractType):
    doc_type: Type[BaseDocWithoutId]
    __typed_da__: Dict[Type['AnyDocArray'], Dict[Type[BaseDocWithoutId], Type]] = {}

    def __repr__(self):
        return f'<{self.__class__.__name__} (length={len(self)})>'

    @classmethod
    def __class_getitem__(cls, item: Union[Type[BaseDocWithoutId], TypeVar, str]):
        """Return the array class parametrized by the document type `item`.

        Non-class items (e.g. a `TypeVar` or a string) are handed over to the
        generic-alias machinery. For a document class, a subclass of `cls` with
        `doc_type` set to `item` is created once, cached in `__typed_da__` and
        returned on every later call with the same `item`. The subclass gets one
        property per key of `item._docarray_fields()`: reading the property calls
        `_get_data_column`, assigning to it calls `_set_data_column`.

        :param item: the document class (or `TypeVar` / string) between the brackets
        :return: the cached, parametrized subclass of `cls`, or a generic alias
            when `item` is not a class
        :raises ValueError: if `item` is a class that is not a subclass of
            `BaseDocWithoutId`
        """
        if not isinstance(item, type):
            if sys.version_info < (3, 12):
                # Keep the approach in #1147 to be compatible with lower versions
                # of Python. This does nothing more than check that item is a
                # valid type var or str.
                return Generic.__class_getitem__.__func__(cls, item)  # type: ignore
            return GenericAlias(cls, item)  # type: ignore

        if not safe_issubclass(item, BaseDocWithoutId):
            raise ValueError(
                f'{cls.__name__}[item] item should be a Document not a {item} '
            )

        typed_classes = cls.__typed_da__.setdefault(cls, {})

        if item not in typed_classes:
            # Promote to global scope so multiprocessing can pickle it
            global _DocArrayTyped

            class _DocArrayTyped(cls):  # type: ignore
                doc_type: Type[BaseDocWithoutId] = cast(Type[BaseDocWithoutId], item)

            def _property_generator(val: str):
                # `val` is bound per call, so every property keeps its own field
                # name instead of sharing the loop variable.
                # NOTE(review): the arguments of both `raise` statements were
                # lost in this listing; `UNUSABLE_ERROR_MSG` is presumed to be
                # the module-level message template (with a `cls` placeholder)
                # defined outside this excerpt -- confirm.
                def _getter(self):
                    if getattr(self, '_is_unusable', False):
                        raise UnusableObjectError(
                            UNUSABLE_ERROR_MSG.format(cls=cls.__name__)
                        )
                    return self._get_data_column(val)

                def _setter(self, value):
                    if getattr(self, '_is_unusable', False):
                        raise UnusableObjectError(
                            UNUSABLE_ERROR_MSG.format(cls=cls.__name__)
                        )
                    self._set_data_column(val, value)

                # need docstring for the property
                return property(fget=_getter, fset=_setter)

            for field in _DocArrayTyped.doc_type._docarray_fields().keys():
                # this generates property on the fly based on the schema of the item
                setattr(_DocArrayTyped, field, _property_generator(field))

            # The global scope and qualname need to refer to this class a unique name.
            # Otherwise, creating another _DocArrayTyped will overwrite this one.
            # NOTE(review): the callee name was lost in this listing;
            # `change_cls_name` is presumed to be the project helper imported at
            # module level -- confirm.
            change_cls_name(
                _DocArrayTyped, f'{cls.__name__}[{item.__name__}]', globals()
            )

            typed_classes[item] = _DocArrayTyped

        return typed_classes[item]

    # NOTE(review): the decorators and bodies were missing from this listing.
    # `overload` (typing) and `abstractmethod` (abc) are presumed to be imported
    # at module level outside this excerpt -- confirm.
    @overload
    def __getitem__(self: T, item: int) -> T_doc:
        ...

    @overload
    def __getitem__(self: T, item: IndexIterType) -> T:
        ...

    @abstractmethod
    def __getitem__(self, item: Union[int, IndexIterType]) -> Union[T_doc, T]:
        """Return a single document for an `int`, or an array for any other index.

        Kept abstract: a concrete stub here would satisfy the abstract
        `Sequence.__getitem__` and let subclasses silently return `None`.

        :param item: an integer position or an `IndexIterType` index
        :return: one document (`int` index) or an array of the same type
        """
        ...

    def __getattr__(self, item: str):
        """Defer to the regular `__getattribute__` lookup for `item`.

        Defined explicitly only so that PyCharm stops complaining about
        properties it cannot detect.
        """
        parent = super()
        return parent.__getattribute__(item)

    # NOTE(review): no body was present in this listing; the method is presumed
    # to be an abstract hook implemented by subclasses -- confirm.
    @abstractmethod
    def _get_data_column(
        self: T,
        field: str,
    ) -> Union[MutableSequence, T, 'AbstractTensor', None]:
        """Return all values of the field `field` from all docs this array contains.

        :param field: name of the field to extract
        :return: the value of the field for each document in the
            array-like container
        """
        ...

    # NOTE(review): no body was present in this listing; the method is presumed
    # to be an abstract hook implemented by subclasses -- confirm.
    @abstractmethod
    def _set_data_column(
        self: T,
        field: str,
        values: Union[List, T, 'AbstractTensor'],
    ):
        """Set all Documents in this [`DocList`][docarray.array.doc_list.doc_list.DocList] using the passed values

        :param field: name of the field to set
        :param values: the values to set at the DocList level
        """
        ...

    @classmethod
    @abstractmethod
    def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
        """Create an array of this type from a protobuf message.

        :param pb_msg: the protobuf message to read from
        :return: a new instance of `cls`
        """
        ...

    @abstractmethod
    def to_protobuf(self) -> 'DocListProto':
        """Convert DocList into a Protobuf message

        :return: the protobuf message representing this array
        """
        ...

    def _to_node_protobuf(self) -> 'NodeProto':
        """Convert a [`DocList`][docarray.array.doc_list.doc_list.DocList] into a NodeProto
        protobuf message.

        This function should be called when a DocList is nested into
        another Document that needs to be converted into a protobuf.

        :return: the nested item protobuf message
        """
        from docarray.proto import NodeProto

        # wrap the array-level message produced by `to_protobuf` into a node
        return NodeProto(doc_array=self.to_protobuf())

    @abstractmethod
    def traverse_flat(
        self: 'AnyDocArray',
        access_path: str,
    ) -> Union[List[Any], 'AbstractTensor']:
        """
        Return a List of the accessed objects when applying the `access_path`. If this
        results in a nested list or list of [`DocList`s][docarray.array.doc_list.doc_list.DocList], the list will be flattened
        on the first level. The access path is a string that consists of attribute
        names, concatenated and `"__"`-separated. It describes the path from the first
        level to an arbitrary one, e.g. `'content__image__url'`.

        ```python
        from docarray import BaseDoc, DocList, Text


        class Author(BaseDoc):
            name: str


        class Book(BaseDoc):
            author: Author
            content: Text


        docs = DocList[Book](
            Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}'))
            for i in range(10)  # noqa: E501
        )

        books = docs.traverse_flat(access_path='content')  # list of 10 Text objs

        authors = docs.traverse_flat(access_path='author__name')  # list of 10 strings
        ```

        If the resulting list is a nested list, it will be flattened:

        ```python
        from docarray import BaseDoc, DocList


        class Chapter(BaseDoc):
            content: str


        class Book(BaseDoc):
            chapters: DocList[Chapter]


        docs = DocList[Book](
            Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)]))
            for _ in range(10)
        )

        chapters = docs.traverse_flat(access_path='chapters')  # list of 30 Chapter objs
        ```

        If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of
        type `AnyTensor`, the doc_vec tensor will be returned instead of a list:

        ```python
        class Image(BaseDoc):
            tensor: TorchTensor[3, 224, 224]


        batch = DocList[Image](
            [
                Image(
                    tensor=torch.zeros(3, 224, 224),
                )
                for _ in range(2)
            ]
        )

        batch_stacked = batch.stack()
        tensors = batch_stacked.traverse_flat(
            access_path='tensor'
        )  # tensor of shape (2, 3, 224, 224)
        ```

        :param access_path: a string that represents the access path ("__"-separated).
        :return: list of the accessed objects, flattened if nested.
        """
        ...

    @staticmethod
    def _traverse(node: Any, access_path: str):
        """Lazily walk `access_path` starting from `node`.

        The first `"__"`-separated attribute name of the path is looked up on
        `node` (or on each element when `node` is a `DocList` or `list`) and the
        walk recurses with the rest of the path. Once the path is exhausted the
        node that was reached is yielded.

        :param node: the object (or list of objects) to start from
        :param access_path: remaining `"__"`-separated attribute names
        :yield: every object found at the end of the access path
        """
        if not access_path:
            # nothing left to resolve: this node is a result
            yield node
            return

        curr_attr, _, path_attrs = access_path.partition('__')

        from docarray.array import DocList

        if isinstance(node, (DocList, list)):
            # containers have no such attribute themselves; descend per element
            for n in node:
                yield from AnyDocArray._traverse(getattr(n, curr_attr), path_attrs)
        else:
            yield from AnyDocArray._traverse(getattr(node, curr_attr), path_attrs)

    @staticmethod
    def _flatten_one_level(sequence: List[Any]) -> List[Any]:
        """Flatten `sequence` by one level if it is a list of lists / DocLists.

        Only the first element is inspected to decide whether the sequence is
        nested.

        :param sequence: the (possibly nested) list to flatten
        :return: `sequence` itself when it is empty or its first element is not
            a `list` / `DocList`, otherwise a new list with the items of every
            sub-list in order
        """
        from docarray import DocList

        if len(sequence) == 0 or not isinstance(sequence[0], (list, DocList)):
            return sequence
        return [item for sublist in sequence for item in sublist]

    def summary(self):
        """
        Print a summary of this [`DocList`][docarray.array.doc_list.doc_list.DocList] object and a summary of the schema of its
        Document type.
        """
        # NOTE(review): the body was missing from this listing. The call below
        # is presumed to be the project's summary printer -- confirm the module
        # path and class name against the package.
        from docarray.display.document_array_summary import DocArraySummary

        DocArraySummary(self).summary()

    def _batch(
        self: T,
        batch_size: int,
        shuffle: bool = False,
        show_progress: bool = False,
    ) -> Generator[T, None, None]:
        """
        Creates a `Generator` that yields [`DocList`][docarray.array.doc_list.doc_list.DocList] of size `batch_size`.
        Note, that the last batch might be smaller than `batch_size`.

        :param batch_size: Size of each generated batch.
        :param shuffle: If set, shuffle the Documents before dividing into minibatches.
        :param show_progress: if set, show a progress bar when batching documents.
        :raises ValueError: if `batch_size` is not a positive integer.
        :yield: a Generator of [`DocList`][docarray.array.doc_list.doc_list.DocList], each in the length of `batch_size`
        """
        # NOTE(review): the shuffle statement and the iterable passed to `track`
        # were missing from this listing and have been reconstructed from the
        # surrounding variables -- confirm.
        import random

        from rich.progress import track

        if not (isinstance(batch_size, int) and batch_size > 0):
            raise ValueError(
                f'`batch_size` should be a positive integer, received: {batch_size}'
            )

        N = len(self)
        indices = list(range(N))
        n_batches = int(np.ceil(N / batch_size))

        if shuffle:
            # shuffle positions, not the array, so `self` is left untouched
            random.shuffle(indices)

        for i in track(
            range(n_batches),
            description='Batching documents',
            disable=not show_progress,
        ):
            yield self[indices[i * batch_size : (i + 1) * batch_size]]

from_protobuf(pb_msg) abstractmethod classmethod

Create an array of this type from a protobuf message.

Source code in docarray/array/
def from_protobuf(cls: Type[T], pb_msg: 'DocListProto') -> T:
    """Build an instance of `cls` out of the protobuf message `pb_msg`."""
    ...


summary()

Print a summary of this DocList object and a summary of the schema of its Document type.

Source code in docarray/array/
def summary(self):
    """
    Print a summary of this [`DocList`][docarray.array.doc_list.doc_list.DocList] object and a summary of the schema of its
    Document type.
    """
    # NOTE(review): the body was missing from this listing. The call below is
    # presumed to be the project's summary printer -- confirm the module path
    # and class name against the package.
    from docarray.display.document_array_summary import DocArraySummary

    DocArraySummary(self).summary()

to_protobuf() abstractmethod

Convert DocList into a Protobuf message

Source code in docarray/array/
def to_protobuf(self) -> 'DocListProto':
    """Serialize this DocList as a Protobuf message."""
    ...

traverse_flat(access_path) abstractmethod

Return a List of the accessed objects when applying the access_path. If this results in a nested list or list of DocLists, the list will be flattened on the first level. The access path is a string that consists of attribute names, concatenated and "__"-separated. It describes the path from the first level to an arbitrary one, e.g. 'content__image__url'.

from docarray import BaseDoc, DocList, Text


class Author(BaseDoc):
    name: str


class Book(BaseDoc):
    author: Author
    content: Text


# ten books, each with a nested Author document and a Text document
docs = DocList[Book](
    Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}'))
    for i in range(10)  # noqa: E501
)

books = docs.traverse_flat(access_path='content')  # list of 10 Text objs

# "__" steps into the nested document: Book.author -> Author.name
authors = docs.traverse_flat(access_path='author__name')  # list of 10 strings

If the resulting list is a nested list, it will be flattened:

from docarray import BaseDoc, DocList


class Chapter(BaseDoc):
    content: str


class Book(BaseDoc):
    chapters: DocList[Chapter]


# ten books with three chapters each
docs = DocList[Book](
    Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)]))
    for _ in range(10)
)

# the ten per-book DocLists are flattened on the first level
chapters = docs.traverse_flat(access_path='chapters')  # list of 30 Chapter objs

If your DocList is in doc_vec mode, and you want to access a field of type AnyTensor, the doc_vec tensor will be returned instead of a list:

import torch

from docarray import BaseDoc, DocList
from docarray.typing import TorchTensor


class Image(BaseDoc):
    tensor: TorchTensor[3, 224, 224]


batch = DocList[Image](
    [
        Image(
            tensor=torch.zeros(3, 224, 224),
        )
        for _ in range(2)
    ]
)

batch_stacked = batch.stack()
tensors = batch_stacked.traverse_flat(
    access_path='tensor'
)  # tensor of shape (2, 3, 224, 224)


Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| access_path | str | a string that represents the access path ("__"-separated). | required |



Returns:

| Type | Description |
| --- | --- |
| Union[List[Any], AbstractTensor] | list of the accessed objects, flattened if nested. |

Source code in docarray/array/
def traverse_flat(
    self: 'AnyDocArray',
    access_path: str,
) -> Union[List[Any], 'AbstractTensor']:
    Return a List of the accessed objects when applying the `access_path`. If this
    results in a nested list or list of [`DocList`s][docarray.array.doc_list.doc_list.DocList], the list will be flattened
    on the first level. The access path is a string that consists of attribute
    names, concatenated and `"__"`-separated. It describes the path from the first
    level to an arbitrary one, e.g. `'content__image__url'`.

    from docarray import BaseDoc, DocList, Text

    class Author(BaseDoc):
        name: str

    class Book(BaseDoc):
        author: Author
        content: Text

    docs = DocList[Book](
        Book(author=Author(name='Jenny'), content=Text(text=f'book_{i}'))
        for i in range(10)  # noqa: E501

    books = docs.traverse_flat(access_path='content')  # list of 10 Text objs

    authors = docs.traverse_flat(access_path='author__name')  # list of 10 strings

    If the resulting list is a nested list, it will be flattened:

    from docarray import BaseDoc, DocList

    class Chapter(BaseDoc):
        content: str

    class Book(BaseDoc):
        chapters: DocList[Chapter]

    docs = DocList[Book](
        Book(chapters=DocList[Chapter]([Chapter(content='some_content') for _ in range(3)]))
        for _ in range(10)

    chapters = docs.traverse_flat(access_path='chapters')  # list of 30 strings

    If your [`DocList`][docarray.array.doc_list.doc_list.DocList] is in doc_vec mode, and you want to access a field of
    type `AnyTensor`, the doc_vec tensor will be returned instead of a list:

    class Image(BaseDoc):
        tensor: TorchTensor[3, 224, 224]

    batch = DocList[Image](
                tensor=torch.zeros(3, 224, 224),
            for _ in range(2)

    batch_stacked = batch.stack()
    tensors = batch_stacked.traverse_flat(
    )  # tensor of shape (2, 3, 224, 224)

    :param access_path: a string that represents the access path ("__"-separated).
    :return: list of the accessed objects, flattened if nested.