InMemoryExactNNIndex

`docarray.index.backends.in_memory.InMemoryExactNNIndex`

Bases: BaseDocIndex, Generic[TSchema]

Source code in docarray/index/backends/in_memory.py

class InMemoryExactNNIndex(BaseDocIndex, Generic[TSchema]):
    def __init__(
        self,
        docs: Optional[DocList] = None,
        db_config=None,
        **kwargs,
    ):
        """Initialize InMemoryExactNNIndex"""
        super().__init__(db_config=db_config, **kwargs)
        self._runtime_config = self.RuntimeConfig()
        self._db_config = cast(InMemoryExactNNIndex.DBConfig, self._db_config)
        self._index_file_path = self._db_config.index_file_path

        if docs and self._index_file_path:
            raise ValueError(
                'Initialize `InMemoryExactNNIndex` with either `docs` or '
                '`index_file_path`, not both. Provide `docs` for a fresh index, or '
                '`index_file_path` to use an existing file.'
            )

        if self._index_file_path:
            if os.path.exists(self._index_file_path):
                self._logger.info(
                    f'Loading index from a binary file: {self._index_file_path}'
                )
                self._docs = DocList.__class_getitem__(
                    cast(Type[BaseDoc], self._schema)
                ).load_binary(file=self._index_file_path)

                data_by_columns = self._get_col_value_dict(self._docs)
                self._update_subindex_data(self._docs)
                self._index_subindex(data_by_columns)

            else:
                self._logger.warning(
                    f'Index file does not exist: {self._index_file_path}. '
                    f'Initializing empty InMemoryExactNNIndex.'
                )
                self._docs = DocList.__class_getitem__(
                    cast(Type[BaseDoc], self._schema)
                )()
        else:
            if docs:
                self._logger.info('Docs provided. Initializing with provided docs.')
                self._docs = docs
            else:
                self._logger.info(
                    'No docs or index file provided. Initializing empty InMemoryExactNNIndex.'
                )
                self._docs = DocList.__class_getitem__(
                    cast(Type[BaseDoc], self._schema)
                )()

        self._embedding_map: Dict[str, Tuple[AnyTensor, Optional[List[int]]]] = {}
        self._ids_to_positions: Dict[str, int] = {}

    def python_type_to_db_type(self, python_type: Type) -> Any:
        """Map python type to database type.
        Takes any python type and returns the corresponding database column type.

        :param python_type: a python type.
        :return: the corresponding database column type,
            or None if ``python_type`` is not supported.
        """
        return python_type

    @property
    def out_schema(self) -> Type[BaseDoc]:
        """Return the original schema (without the parent_id from new_schema type)"""
        if self._is_subindex:
            return self._ori_schema
        return cast(Type[BaseDoc], self._schema)

    class QueryBuilder(BaseDocIndex.QueryBuilder):
        def __init__(self, query: Optional[List[Tuple[str, Dict]]] = None):
            super().__init__()
            # list of tuples (method name, kwargs)
            self._queries: List[Tuple[str, Dict]] = query or []

        def build(self, *args, **kwargs) -> Any:
            """Build the query object."""
            return self._queries

        find = _collect_query_args('find')
        find_batched = _collect_query_args('find_batched')
        filter = _collect_query_args('filter')
        filter_batched = _raise_not_supported('find_batched')
        text_search = _raise_not_supported('text_search')
        text_search_batched = _raise_not_supported('text_search')

    @dataclass
    class DBConfig(BaseDocIndex.DBConfig):
        """Dataclass that contains all "static" configurations of InMemoryExactNNIndex."""

        index_file_path: Optional[str] = None
        default_column_config: Dict[Type, Dict[str, Any]] = field(
            default_factory=lambda: defaultdict(
                dict,
                {
                    AbstractTensor: {'space': 'cosine_sim'},
                },
            )
        )

    @dataclass
    class RuntimeConfig(BaseDocIndex.RuntimeConfig):
        """Dataclass that contains all "dynamic" configurations of InMemoryExactNNIndex."""

        pass

    def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
        """index Documents into the index.

        !!! note
            Passing a sequence of Documents that is not a DocList
            (such as a List of Docs) comes at a performance penalty.
            This is because the Index needs to check compatibility between itself and
            the data. With a DocList as input this is a single check; for other inputs
            compatibility needs to be checked for every Document individually.

        :param docs: Documents to index.
        """
        # implementing the public option because conversion to column dict is not needed
        docs = self._validate_docs(docs)
        ids_to_positions = self._get_ids_to_positions()
        for doc in docs:
            if doc.id in ids_to_positions:
                self._docs[ids_to_positions[doc.id]] = doc
            else:
                self._docs.append(doc)
                self._ids_to_positions[str(doc.id)] = len(self._ids_to_positions)

        # Add parent_id to all sub-index documents and store sub-index documents
        data_by_columns = self._get_col_value_dict(docs)
        self._update_subindex_data(docs)
        self._index_subindex(data_by_columns)

        self._rebuild_embedding()

    def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]):
        raise NotImplementedError

    def num_docs(self) -> int:
        """
        Get the number of documents.
        """
        return len(self._docs)

    def _rebuild_embedding(self):
        """
        Reconstructs the embeddings map for each field. This is performed to store pre-stacked
        embeddings, thereby optimizing performance by avoiding repeated stacking of embeddings.

        Note: '_embedding_map' is a dictionary mapping fields to their corresponding embeddings.
        """
        if self._is_index_empty:
            self._embedding_map = dict()
        else:
            for field_, embedding in self._embedding_map.items():
                self._embedding_map[field_] = _extract_embeddings(self._docs, field_)

    def _del_items(self, doc_ids: Sequence[str]):
        """Delete Documents from the index.

        :param doc_ids: ids to delete from the Document Store
        """
        for field_, type_, _ in self._flatten_schema(cast(Type[BaseDoc], self._schema)):
            if safe_issubclass(type_, AnyDocArray):
                for id in doc_ids:
                    doc_ = self._get_items([id])
                    if len(doc_) == 0:
                        raise KeyError(
                            f"The document (id = '{id}') does not exist in the ExactNNIndexer."
                        )
                    sub_ids = [sub_doc.id for sub_doc in getattr(doc_[0], field_)]
                    del self._subindices[field_][sub_ids]

        indices = []
        for i, doc in enumerate(self._docs):
            if doc.id in doc_ids:
                indices.append(i)

        del self._docs[indices]
        self._update_ids_to_positions()
        self._rebuild_embedding()

    def _ori_items(self, doc: BaseDoc) -> BaseDoc:
        """
        The Indexer's backend stores parent_id to support nested data. However,
        this method enables us to retrieve the original items in their original
        type, which is what the user interacts with.

        :param doc: The input document in New_Schema format from the Indexer's backend.
        :return: The input document with its original schema.
        """

        ori_doc = _shallow_copy_doc(doc)
        for field_name, type_, _ in self._flatten_schema(
            cast(Type[BaseDoc], self.out_schema)
        ):
            if safe_issubclass(type_, AnyDocArray):
                _list = getattr(ori_doc, field_name)
                for i, nested_doc in enumerate(_list):
                    sub_indexer: InMemoryExactNNIndex = cast(
                        InMemoryExactNNIndex, self._subindices[field_name]
                    )
                    nested_doc = self._subindices[field_name]._ori_schema(
                        **nested_doc.__dict__
                    )

                    _list[i] = sub_indexer._ori_items(nested_doc)

        return ori_doc

    def _get_items(
        self, doc_ids: Sequence[str], raw: bool = False
    ) -> Union[Sequence[TSchema], Sequence[Dict[str, Any]]]:
        """Get Documents from the index, by `id`.
        If no document is found, a KeyError is raised.

        :param doc_ids: ids to get from the Document index
        :param raw: if raw, output the new_schema type (with parent id)
        :return: Sequence of Documents, sorted corresponding to the order of `doc_ids`.
            Duplicate `doc_ids` can be omitted in the output.
        """

        out_docs = []
        ids_to_positions = self._get_ids_to_positions()
        for doc_id in doc_ids:
            if doc_id not in ids_to_positions:
                continue
            doc = self._docs[ids_to_positions[doc_id]]
            if raw:
                out_docs.append(doc)
            else:
                ori_doc = self._ori_items(doc)
                schema_cls = cast(Type[BaseDoc], self.out_schema)
                new_doc = schema_cls(**ori_doc.__dict__)
                out_docs.append(new_doc)

        return out_docs

    def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any:
        """
        Execute a query on the InMemoryExactNNIndex.

        Can take two kinds of inputs:

        1. A native query of the underlying database. This is meant as a passthrough so that you
        can enjoy any functionality that is not available through the Document index API.
        2. The output of this Document index' `QueryBuilder.build()` method.

        :param query: the query to execute
        :param args: positional arguments to pass to the query
        :param kwargs: keyword arguments to pass to the query
        :return: the result of the query
        """
        if args or kwargs:
            raise ValueError(
                f'args and kwargs not supported for `execute_query` on {type(self)}'
            )
        return self._find_and_filter(query)

    def _find_and_filter(self, query: List[Tuple[str, Dict]]) -> FindResult:
        """
        The function executes search operations such as 'find' and 'filter' in the order
        they appear in the query. The 'find' operation performs a vector similarity search.
        The 'filter' operation filters out documents based on a filter query.
        The documents are finally sorted based on their scores.

        :param query: The query to execute.
        :return: A tuple of retrieved documents and their scores.
        """
        out_docs = self._docs
        doc_to_score: Dict[BaseDoc, Any] = {}
        for op, op_kwargs in query:
            if op == 'find':
                out_docs, scores = find(
                    index=out_docs,
                    query=op_kwargs['query'],
                    search_field=op_kwargs['search_field'],
                    limit=op_kwargs.get('limit', len(out_docs)),
                    metric=self._column_infos[op_kwargs['search_field']].config[
                        'space'
                    ],
                )
                doc_to_score.update(zip(out_docs.id, scores))
            elif op == 'filter':
                out_docs = filter_docs(out_docs, op_kwargs['filter_query'])
                if 'limit' in op_kwargs:
                    out_docs = out_docs[: op_kwargs['limit']]
            else:
                raise ValueError(f'Query operation is not supported: {op}')

        scores_and_docs = zip([doc_to_score[doc.id] for doc in out_docs], out_docs)
        sorted_lists = sorted(scores_and_docs, reverse=True)
        out_scores, out_docs = zip(*sorted_lists)

        return FindResult(documents=out_docs, scores=out_scores)

    def find(
        self,
        query: Union[AnyTensor, BaseDoc],
        search_field: str = '',
        limit: int = 10,
        **kwargs,
    ) -> FindResult:
        """Find Documents in the index using nearest-neighbor search.

        :param query: query vector for KNN/ANN search.
            Can be either a tensor-like (np.array, torch.Tensor, etc.)
            with a single axis, or a Document
        :param search_field: name of the field to search on.
            Documents in the index are retrieved based on this similarity
            of this field to the query.
        :param limit: maximum number of Documents to return
        :return: a named tuple containing `documents` and `scores`
        """
        self._logger.debug(f'Executing `find` for search field {search_field}')
        self._validate_search_field(search_field)

        if self._is_index_empty:
            return FindResult(documents=[], scores=[])  # type: ignore

        config = self._column_infos[search_field].config

        docs, scores = find(
            index=self._docs,
            query=query,
            search_field=search_field,
            limit=limit,
            metric=config['space'],
            cache=self._embedding_map,
        )

        docs_ = []
        for doc in docs:
            ori_doc = self._ori_items(doc)
            schema_cls = cast(Type[BaseDoc], self.out_schema)
            docs_.append(schema_cls(**ori_doc.__dict__))

        docs_with_schema = DocList.__class_getitem__(
            cast(Type[BaseDoc], self.out_schema)
        )(docs_)

        return FindResult(documents=docs_with_schema, scores=scores)

    def _find(
        self, query: np.ndarray, limit: int, search_field: str = ''
    ) -> _FindResult:
        raise NotImplementedError

    def find_batched(
        self,
        queries: Union[AnyTensor, DocList],
        search_field: str = '',
        limit: int = 10,
        **kwargs,
    ) -> FindResultBatched:
        """Find Documents in the index using nearest-neighbor search.

        :param queries: query vector for KNN/ANN search.
            Can be either a tensor-like (np.array, torch.Tensor, etc.) with a,
            or a DocList.
            If a tensor-like is passed, it should have shape (batch_size, vector_dim)
        :param search_field: name of the field to search on.
            Documents in the index are retrieved based on this similarity
            of this field to the query.
        :param limit: maximum number of documents to return per query
        :return: a named tuple containing `documents` and `scores`
        """
        self._logger.debug(f'Executing `find_batched` for search field {search_field}')
        self._validate_search_field(search_field)

        if self._is_index_empty:
            return FindResultBatched(documents=[], scores=[])  # type: ignore

        config = self._column_infos[search_field].config

        find_res = find_batched(
            index=self._docs,
            query=cast(NdArray, queries),
            search_field=search_field,
            limit=limit,
            metric=config['space'],
            cache=self._embedding_map,
        )

        return find_res

    def _find_batched(
        self, queries: np.ndarray, limit: int, search_field: str = ''
    ) -> _FindResultBatched:
        raise NotImplementedError

    def filter(
        self,
        filter_query: Any,
        limit: int = 10,
        **kwargs,
    ) -> DocList:
        """Find documents in the index based on a filter query

        :param filter_query: the filter query to execute following the query
            language of
        :param limit: maximum number of documents to return
        :return: a DocList containing the documents that match the filter query
        """
        self._logger.debug(f'Executing `filter` for the query {filter_query}')

        docs = filter_docs(docs=self._docs, query=filter_query)[:limit]
        return cast(DocList, docs)

    def _filter(self, filter_query: Any, limit: int) -> Union[DocList, List[Dict]]:
        raise NotImplementedError

    def _filter_batched(
        self, filter_queries: Any, limit: int
    ) -> Union[List[DocList], List[List[Dict]]]:
        raise NotImplementedError(f'{type(self)} does not support filtering.')

    def _text_search(
        self, query: str, limit: int, search_field: str = ''
    ) -> _FindResult:
        raise NotImplementedError(f'{type(self)} does not support text search.')

    def _text_search_batched(
        self, queries: Sequence[str], limit: int, search_field: str = ''
    ) -> _FindResultBatched:
        raise NotImplementedError(f'{type(self)} does not support text search.')

    def _doc_exists(self, doc_id: str) -> bool:
        return doc_id in self._get_ids_to_positions()

    def persist(self, file: Optional[str] = None) -> None:
        """Persist InMemoryExactNNIndex into a binary file."""
        DEFAULT_INDEX_FILE_PATH = 'in_memory_index.bin'
        file_to_save = self._index_file_path or file
        if file_to_save is None:
            self._logger.warning(
                f'persisting index to {DEFAULT_INDEX_FILE_PATH} because no `index_file_path` has been used inside DBConfig and no `file` has been passed as argument'
            )
        file_to_save = file_to_save or DEFAULT_INDEX_FILE_PATH
        self._docs.save_binary(file=file_to_save)

    def _get_root_doc_id(self, id: str, root: str, sub: str) -> str:
        """Get the root_id given the id of a subindex Document and the root and subindex name

        :param id: id of the subindex Document
        :param root: root index name
        :param sub: subindex name
        :return: the root_id of the Document
        """
        subindex: InMemoryExactNNIndex = cast(
            InMemoryExactNNIndex, self._subindices[root]
        )

        if not sub:
            sub_doc = subindex._get_items([id], raw=True)
            parent_id = (
                sub_doc[0]['parent_id']
                if isinstance(sub_doc[0], dict)
                else sub_doc[0].parent_id
            )
            return parent_id
        else:
            fields = sub.split('__')
            cur_root_id = subindex._get_root_doc_id(
                id, fields[0], '__'.join(fields[1:])
            )
            return self._get_root_doc_id(cur_root_id, root, '')

    def _get_ids_to_positions(self) -> Dict[str, int]:
        """
        Obtains a mapping between document IDs and their respective positions
        within the DocList. If this mapping hasn't been initialized, it will be created.

        :return: A dictionary mapping each document ID to its corresponding position.
        """
        if not self._ids_to_positions:
            self._update_ids_to_positions()
        return self._ids_to_positions

    def _update_ids_to_positions(self) -> None:
        """
        Generates or updates the mapping between document IDs and their corresponding
        positions within the DocList.
        """
        self._ids_to_positions = {doc.id: pos for pos, doc in enumerate(self._docs)}

`index_name` `property`

Return the name of the index in the database.

`out_schema: Type[BaseDoc]` `property`

Return the original schema (without the parent_id from new_schema type)

`DBConfig` `dataclass`

Bases: DBConfig

Dataclass that contains all "static" configurations of InMemoryExactNNIndex.

Source code in docarray/index/backends/in_memory.py

@dataclass
class DBConfig(BaseDocIndex.DBConfig):
    """Dataclass that contains all "static" configurations of InMemoryExactNNIndex."""

    index_file_path: Optional[str] = None
    default_column_config: Dict[Type, Dict[str, Any]] = field(
        default_factory=lambda: defaultdict(
            dict,
            {
                AbstractTensor: {'space': 'cosine_sim'},
            },
        )
    )

`QueryBuilder`

Bases: QueryBuilder

Source code in docarray/index/backends/in_memory.py

class QueryBuilder(BaseDocIndex.QueryBuilder):
    def __init__(self, query: Optional[List[Tuple[str, Dict]]] = None):
        super().__init__()
        # list of tuples (method name, kwargs)
        self._queries: List[Tuple[str, Dict]] = query or []

    def build(self, *args, **kwargs) -> Any:
        """Build the query object."""
        return self._queries

    find = _collect_query_args('find')
    find_batched = _collect_query_args('find_batched')
    filter = _collect_query_args('filter')
    filter_batched = _raise_not_supported('find_batched')
    text_search = _raise_not_supported('text_search')
    text_search_batched = _raise_not_supported('text_search')

`build(*args, **kwargs)`

Build the query object.

Source code in docarray/index/backends/in_memory.py

def build(self, *args, **kwargs) -> Any:
    """Build the query object."""
    return self._queries

`RuntimeConfig` `dataclass`

Bases: RuntimeConfig

Dataclass that contains all "dynamic" configurations of InMemoryExactNNIndex.

Source code in docarray/index/backends/in_memory.py

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
    """Dataclass that contains all "dynamic" configurations of InMemoryExactNNIndex."""

    pass

`contains(item)`

Checks if a given document exists in the index.

Parameters:

Name	Type	Description	Default
`item`	`BaseDoc`	The document to check. It must be an instance of BaseDoc or its subclass.	required

Returns:

Type	Description
`bool`	True if the document exists in the index, False otherwise.

Source code in docarray/index/abstract.py

def __contains__(self, item: BaseDoc) -> bool:
    """
    Checks if a given document exists in the index.

    :param item: The document to check.
        It must be an instance of BaseDoc or its subclass.
    :return: True if the document exists in the index, False otherwise.
    """
    if safe_issubclass(type(item), BaseDoc):
        return self._doc_exists(str(item.id))
    else:
        raise TypeError(
            f"item must be an instance of BaseDoc or its subclass, not '{type(item).__name__}'"
        )

`delitem(key)`

Delete one or multiple Documents from the index, by id. If no document is found, a KeyError is raised.

Parameters:

Name	Type	Description	Default
`key`	`Union[str, Sequence[str]]`	id or ids to delete from the Document index	required

Source code in docarray/index/abstract.py

def __delitem__(self, key: Union[str, Sequence[str]]):
    """Delete one or multiple Documents from the index, by `id`.
    If no document is found, a KeyError is raised.

    :param key: id or ids to delete from the Document index
    """
    self._logger.info(f'Deleting documents with id(s) {key} from the index')
    if isinstance(key, str):
        key = [key]

    # delete nested data
    for field_name, type_, _ in self._flatten_schema(
        cast(Type[BaseDoc], self._schema)
    ):
        if safe_issubclass(type_, AnyDocArray):
            for doc_id in key:
                nested_docs_id = self._subindices[field_name]._filter_by_parent_id(
                    doc_id
                )
                if nested_docs_id:
                    del self._subindices[field_name][nested_docs_id]
    # delete data
    self._del_items(key)

`getitem(key)`

Get one or multiple Documents into the index, by id. If no document is found, a KeyError is raised.

Parameters:

Name	Type	Description	Default
`key`	`Union[str, Sequence[str]]`	id or ids to get from the Document index	required

Source code in docarray/index/abstract.py

def __getitem__(
    self, key: Union[str, Sequence[str]]
) -> Union[TSchema, DocList[TSchema]]:
    """Get one or multiple Documents into the index, by `id`.
    If no document is found, a KeyError is raised.

    :param key: id or ids to get from the Document index
    """
    # normalize input
    if isinstance(key, str):
        return_singleton = True
        key = [key]
    else:
        return_singleton = False

    # retrieve data
    doc_sequence = self._get_items(key)

    # check data
    if len(doc_sequence) == 0:
        raise KeyError(f'No document with id {key} found')

    # retrieve nested data
    for field_name, type_, _ in self._flatten_schema(
        cast(Type[BaseDoc], self._schema)
    ):
        if safe_issubclass(type_, AnyDocArray) and isinstance(
            doc_sequence[0], Dict
        ):
            for doc in doc_sequence:
                self._get_subindex_doclist(doc, field_name)  # type: ignore

    # cast output
    if isinstance(doc_sequence, DocList):
        out_docs: DocList[TSchema] = doc_sequence
    elif isinstance(doc_sequence[0], Dict):
        out_docs = self._dict_list_to_docarray(doc_sequence)  # type: ignore
    else:
        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))
        out_docs = docs_cls(doc_sequence)

    return out_docs[0] if return_singleton else out_docs

`init(docs=None, db_config=None, **kwargs)`

Initialize InMemoryExactNNIndex

Source code in docarray/index/backends/in_memory.py

def __init__(
    self,
    docs: Optional[DocList] = None,
    db_config=None,
    **kwargs,
):
    """Initialize InMemoryExactNNIndex"""
    super().__init__(db_config=db_config, **kwargs)
    self._runtime_config = self.RuntimeConfig()
    self._db_config = cast(InMemoryExactNNIndex.DBConfig, self._db_config)
    self._index_file_path = self._db_config.index_file_path

    if docs and self._index_file_path:
        raise ValueError(
            'Initialize `InMemoryExactNNIndex` with either `docs` or '
            '`index_file_path`, not both. Provide `docs` for a fresh index, or '
            '`index_file_path` to use an existing file.'
        )

    if self._index_file_path:
        if os.path.exists(self._index_file_path):
            self._logger.info(
                f'Loading index from a binary file: {self._index_file_path}'
            )
            self._docs = DocList.__class_getitem__(
                cast(Type[BaseDoc], self._schema)
            ).load_binary(file=self._index_file_path)

            data_by_columns = self._get_col_value_dict(self._docs)
            self._update_subindex_data(self._docs)
            self._index_subindex(data_by_columns)

        else:
            self._logger.warning(
                f'Index file does not exist: {self._index_file_path}. '
                f'Initializing empty InMemoryExactNNIndex.'
            )
            self._docs = DocList.__class_getitem__(
                cast(Type[BaseDoc], self._schema)
            )()
    else:
        if docs:
            self._logger.info('Docs provided. Initializing with provided docs.')
            self._docs = docs
        else:
            self._logger.info(
                'No docs or index file provided. Initializing empty InMemoryExactNNIndex.'
            )
            self._docs = DocList.__class_getitem__(
                cast(Type[BaseDoc], self._schema)
            )()

    self._embedding_map: Dict[str, Tuple[AnyTensor, Optional[List[int]]]] = {}
    self._ids_to_positions: Dict[str, int] = {}

`build_query()`

Build a query for this DocumentIndex.

Returns:

Type	Description
`QueryBuilder`	a new `QueryBuilder` object for this DocumentIndex

Source code in docarray/index/abstract.py

def build_query(self) -> QueryBuilder:
    """
    Build a query for this DocumentIndex.

    :return: a new `QueryBuilder` object for this DocumentIndex
    """
    return self.QueryBuilder()  # type: ignore

`configure(runtime_config=None, **kwargs)`

Configure the DocumentIndex. You can either pass a config object to config or pass individual config parameters as keyword arguments. If a configuration object is passed, it will replace the current configuration. If keyword arguments are passed, they will update the current configuration.

Parameters:

Name	Type	Description	Default
`runtime_config`		the configuration to apply	`None`
`kwargs`		individual configuration parameters	`{}`

Source code in docarray/index/abstract.py

def configure(self, runtime_config=None, **kwargs):
    """
    Configure the DocumentIndex.
    You can either pass a config object to `config` or pass individual config
    parameters as keyword arguments.
    If a configuration object is passed, it will replace the current configuration.
    If keyword arguments are passed, they will update the current configuration.

    :param runtime_config: the configuration to apply
    :param kwargs: individual configuration parameters
    """
    if runtime_config is None:
        self._runtime_config = replace(self._runtime_config, **kwargs)
    else:
        if not isinstance(runtime_config, self.RuntimeConfig):
            raise ValueError(f'runtime_config must be of type {self.RuntimeConfig}')
        self._runtime_config = runtime_config

`execute_query(query, *args, **kwargs)`

Execute a query on the InMemoryExactNNIndex.

Can take two kinds of inputs:

A native query of the underlying database. This is meant as a passthrough so that you can enjoy any functionality that is not available through the Document index API.
The output of this Document index' QueryBuilder.build() method.

Parameters:

Name	Type	Description	Default
`query`	`List[Tuple[str, Dict]]`	the query to execute	required
`args`		positional arguments to pass to the query	`()`
`kwargs`		keyword arguments to pass to the query	`{}`

Returns:

Type	Description
`Any`	the result of the query

Source code in docarray/index/backends/in_memory.py

def execute_query(self, query: List[Tuple[str, Dict]], *args, **kwargs) -> Any:
    """
    Execute a query on the InMemoryExactNNIndex.

    Can take two kinds of inputs:

    1. A native query of the underlying database. This is meant as a passthrough so that you
    can enjoy any functionality that is not available through the Document index API.
    2. The output of this Document index' `QueryBuilder.build()` method.

    :param query: the query to execute
    :param args: positional arguments to pass to the query
    :param kwargs: keyword arguments to pass to the query
    :return: the result of the query
    """
    if args or kwargs:
        raise ValueError(
            f'args and kwargs not supported for `execute_query` on {type(self)}'
        )
    return self._find_and_filter(query)

`filter(filter_query, limit=10, **kwargs)`

Find documents in the index based on a filter query

Parameters:

Name	Type	Description	Default
`filter_query`	`Any`	the filter query to execute following the query language of	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`DocList`	a DocList containing the documents that match the filter query

Source code in docarray/index/backends/in_memory.py

def filter(
    self,
    filter_query: Any,
    limit: int = 10,
    **kwargs,
) -> DocList:
    """Find documents in the index based on a filter query

    :param filter_query: the filter query to execute following the query
        language of
    :param limit: maximum number of documents to return
    :return: a DocList containing the documents that match the filter query
    """
    self._logger.debug(f'Executing `filter` for the query {filter_query}')

    docs = filter_docs(docs=self._docs, query=filter_query)[:limit]
    return cast(DocList, docs)

`filter_batched(filter_queries, limit=10, **kwargs)`

Find documents in the index based on multiple filter queries.

Parameters:

Name	Type	Description	Default
`filter_queries`	`Any`	the DB specific filter query to execute	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`List[DocList]`	a DocList containing the documents that match the filter query

Source code in docarray/index/abstract.py

def filter_batched(
    self,
    filter_queries: Any,
    limit: int = 10,
    **kwargs,
) -> List[DocList]:
    """Find documents in the index based on multiple filter queries.

    :param filter_queries: the DB specific filter query to execute
    :param limit: maximum number of documents to return
    :return: a DocList containing the documents that match the filter query
    """
    self._logger.debug(
        f'Executing `filter_batched` for the queries {filter_queries}'
    )
    da_list = self._filter_batched(filter_queries, limit=limit, **kwargs)

    if len(da_list) > 0 and isinstance(da_list[0], List):
        da_list = [self._dict_list_to_docarray(docs) for docs in da_list]

    return da_list  # type: ignore

`filter_subindex(filter_query, subindex, limit=10, **kwargs)`

Find documents in subindex level based on a filter query

Parameters:

Name	Type	Description	Default
`filter_query`	`Any`	the DB specific filter query to execute	required
`subindex`	`str`	name of the subindex to search on	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`DocList`	a DocList containing the subindex level documents that match the filter query

Source code in docarray/index/abstract.py

def filter_subindex(
    self,
    filter_query: Any,
    subindex: str,
    limit: int = 10,
    **kwargs,
) -> DocList:
    """Find documents in subindex level based on a filter query

    :param filter_query: the DB specific filter query to execute
    :param subindex: name of the subindex to search on
    :param limit: maximum number of documents to return
    :return: a DocList containing the subindex level documents that match the filter query
    """
    self._logger.debug(
        f'Executing `filter` for the query {filter_query} in subindex {subindex}'
    )
    if '__' in subindex:
        fields = subindex.split('__')
        return self._subindices[fields[0]].filter_subindex(
            filter_query, '__'.join(fields[1:]), limit=limit, **kwargs
        )
    else:
        return self._subindices[subindex].filter(
            filter_query, limit=limit, **kwargs
        )

`find(query, search_field='', limit=10, **kwargs)`

Find Documents in the index using nearest-neighbor search.

Parameters:

Name	Type	Description	Default
`query`	`Union[AnyTensor, BaseDoc]`	query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a single axis, or a Document	required
`search_field`	`str`	name of the field to search on. Documents in the index are retrieved based on this similarity of this field to the query.	`''`
`limit`	`int`	maximum number of Documents to return	`10`

Returns:

Type	Description
`FindResult`	a named tuple containing `documents` and `scores`

Source code in docarray/index/backends/in_memory.py

def find(
    self,
    query: Union[AnyTensor, BaseDoc],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResult:
    """Find Documents in the index using nearest-neighbor search.

    :param query: query vector for KNN/ANN search.
        Can be either a tensor-like (np.array, torch.Tensor, etc.)
        with a single axis, or a Document
    :param search_field: name of the field to search on.
        Documents in the index are retrieved based on this similarity
        of this field to the query.
    :param limit: maximum number of Documents to return
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(f'Executing `find` for search field {search_field}')
    self._validate_search_field(search_field)

    if self._is_index_empty:
        return FindResult(documents=[], scores=[])  # type: ignore

    config = self._column_infos[search_field].config

    docs, scores = find(
        index=self._docs,
        query=query,
        search_field=search_field,
        limit=limit,
        metric=config['space'],
        cache=self._embedding_map,
    )

    docs_ = []
    for doc in docs:
        ori_doc = self._ori_items(doc)
        schema_cls = cast(Type[BaseDoc], self.out_schema)
        docs_.append(schema_cls(**ori_doc.__dict__))

    docs_with_schema = DocList.__class_getitem__(
        cast(Type[BaseDoc], self.out_schema)
    )(docs_)

    return FindResult(documents=docs_with_schema, scores=scores)

`find_batched(queries, search_field='', limit=10, **kwargs)`

Find Documents in the index using nearest-neighbor search.

Parameters:

Name	Type	Description	Default
`queries`	`Union[AnyTensor, DocList]`	query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, or a DocList. If a tensor-like is passed, it should have shape (batch_size, vector_dim)	required
`search_field`	`str`	name of the field to search on. Documents in the index are retrieved based on this similarity of this field to the query.	`''`
`limit`	`int`	maximum number of documents to return per query	`10`

Returns:

Type	Description
`FindResultBatched`	a named tuple containing `documents` and `scores`

Source code in docarray/index/backends/in_memory.py

def find_batched(
    self,
    queries: Union[AnyTensor, DocList],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResultBatched:
    """Find Documents in the index using nearest-neighbor search.

    :param queries: query vector for KNN/ANN search.
        Can be either a tensor-like (np.array, torch.Tensor, etc.) with a,
        or a DocList.
        If a tensor-like is passed, it should have shape (batch_size, vector_dim)
    :param search_field: name of the field to search on.
        Documents in the index are retrieved based on this similarity
        of this field to the query.
    :param limit: maximum number of documents to return per query
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(f'Executing `find_batched` for search field {search_field}')
    self._validate_search_field(search_field)

    if self._is_index_empty:
        return FindResultBatched(documents=[], scores=[])  # type: ignore

    config = self._column_infos[search_field].config

    find_res = find_batched(
        index=self._docs,
        query=cast(NdArray, queries),
        search_field=search_field,
        limit=limit,
        metric=config['space'],
        cache=self._embedding_map,
    )

    return find_res

`find_subindex(query, subindex='', search_field='', limit=10, **kwargs)`

Find documents in subindex level.

Parameters:

Name	Type	Description	Default
`query`	`Union[AnyTensor, BaseDoc]`	query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a single axis, or a Document	required
`subindex`	`str`	name of the subindex to search on	`''`
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`SubindexFindResult`	a named tuple containing root docs, subindex docs and scores

Source code in docarray/index/abstract.py

def find_subindex(
    self,
    query: Union[AnyTensor, BaseDoc],
    subindex: str = '',
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> SubindexFindResult:
    """Find documents in subindex level.

    :param query: query vector for KNN/ANN search.
        Can be either a tensor-like (np.array, torch.Tensor, etc.)
        with a single axis, or a Document
    :param subindex: name of the subindex to search on
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :return: a named tuple containing root docs, subindex docs and scores
    """
    self._logger.debug(f'Executing `find_subindex` for search field {search_field}')

    sub_docs, scores = self._find_subdocs(
        query, subindex=subindex, search_field=search_field, limit=limit, **kwargs
    )

    fields = subindex.split('__')
    root_ids = [
        self._get_root_doc_id(doc.id, fields[0], '__'.join(fields[1:]))
        for doc in sub_docs
    ]
    root_docs = DocList[self._schema]()  # type: ignore
    for id in root_ids:
        root_docs.append(self[id])

    return SubindexFindResult(
        root_documents=root_docs, sub_documents=sub_docs, scores=scores  # type: ignore
    )

`index(docs, **kwargs)`

index Documents into the index.

Note

Passing a sequence of Documents that is not a DocList (such as a List of Docs) comes at a performance penalty. This is because the Index needs to check compatibility between itself and the data. With a DocList as input this is a single check; for other inputs compatibility needs to be checked for every Document individually.

Parameters:

Name	Type	Description	Default
`docs`	`Union[BaseDoc, Sequence[BaseDoc]]`	Documents to index.	required

Source code in docarray/index/backends/in_memory.py

def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
    """index Documents into the index.

    !!! note
        Passing a sequence of Documents that is not a DocList
        (such as a List of Docs) comes at a performance penalty.
        This is because the Index needs to check compatibility between itself and
        the data. With a DocList as input this is a single check; for other inputs
        compatibility needs to be checked for every Document individually.

    :param docs: Documents to index.
    """
    # implementing the public option because conversion to column dict is not needed
    docs = self._validate_docs(docs)
    ids_to_positions = self._get_ids_to_positions()
    for doc in docs:
        if doc.id in ids_to_positions:
            self._docs[ids_to_positions[doc.id]] = doc
        else:
            self._docs.append(doc)
            self._ids_to_positions[str(doc.id)] = len(self._ids_to_positions)

    # Add parent_id to all sub-index documents and store sub-index documents
    data_by_columns = self._get_col_value_dict(docs)
    self._update_subindex_data(docs)
    self._index_subindex(data_by_columns)

    self._rebuild_embedding()

`num_docs()`

Get the number of documents.

Source code in docarray/index/backends/in_memory.py

def num_docs(self) -> int:
    """
    Get the number of documents.
    """
    return len(self._docs)

`persist(file=None)`

Persist InMemoryExactNNIndex into a binary file.

Source code in docarray/index/backends/in_memory.py

def persist(self, file: Optional[str] = None) -> None:
    """Persist InMemoryExactNNIndex into a binary file."""
    DEFAULT_INDEX_FILE_PATH = 'in_memory_index.bin'
    file_to_save = self._index_file_path or file
    if file_to_save is None:
        self._logger.warning(
            f'persisting index to {DEFAULT_INDEX_FILE_PATH} because no `index_file_path` has been used inside DBConfig and no `file` has been passed as argument'
        )
    file_to_save = file_to_save or DEFAULT_INDEX_FILE_PATH
    self._docs.save_binary(file=file_to_save)

`python_type_to_db_type(python_type)`

Map python type to database type. Takes any python type and returns the corresponding database column type.

Parameters:

Name	Type	Description	Default
`python_type`	`Type`	a python type.	required

Returns:

Type	Description
`Any`	the corresponding database column type, or None if `python_type` is not supported.

Source code in docarray/index/backends/in_memory.py

def python_type_to_db_type(self, python_type: Type) -> Any:
    """Map python type to database type.
    Takes any python type and returns the corresponding database column type.

    :param python_type: a python type.
    :return: the corresponding database column type,
        or None if ``python_type`` is not supported.
    """
    return python_type

`subindex_contains(item)`

Checks if a given BaseDoc item is contained in the index or any of its subindices.

Parameters:

Name	Type	Description	Default
`item`	`BaseDoc`	the given BaseDoc	required

Returns:

Type	Description
`bool`	if the given BaseDoc item is contained in the index/subindices

Source code in docarray/index/abstract.py

def subindex_contains(self, item: BaseDoc) -> bool:
    """Checks if a given BaseDoc item is contained in the index or any of its subindices.

    :param item: the given BaseDoc
    :return: if the given BaseDoc item is contained in the index/subindices
    """
    if self._is_index_empty:
        return False

    if safe_issubclass(type(item), BaseDoc):
        return self.__contains__(item) or any(
            index.subindex_contains(item) for index in self._subindices.values()
        )
    else:
        raise TypeError(
            f"item must be an instance of BaseDoc or its subclass, not '{type(item).__name__}'"
        )

`text_search(query, search_field='', limit=10, **kwargs)`

Find documents in the index based on a text search query.

Parameters:

Name	Type	Description	Default
`query`	`Union[str, BaseDoc]`	The text to search for	required
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`FindResult`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def text_search(
    self,
    query: Union[str, BaseDoc],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResult:
    """Find documents in the index based on a text search query.

    :param query: The text to search for
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(f'Executing `text_search` for search field {search_field}')
    self._validate_search_field(search_field)
    if isinstance(query, BaseDoc):
        query_text = self._get_values_by_column([query], search_field)[0]
    else:
        query_text = query
    docs, scores = self._text_search(
        query_text, search_field=search_field, limit=limit, **kwargs
    )

    if isinstance(docs, List) and not isinstance(docs, DocList):
        docs = self._dict_list_to_docarray(docs)

    return FindResult(documents=docs, scores=scores)

`text_search_batched(queries, search_field='', limit=10, **kwargs)`

Find documents in the index based on a text search query.

Parameters:

Name	Type	Description	Default
`queries`	`Union[Sequence[str], Sequence[BaseDoc]]`	The texts to search for	required
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`FindResultBatched`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def text_search_batched(
    self,
    queries: Union[Sequence[str], Sequence[BaseDoc]],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResultBatched:
    """Find documents in the index based on a text search query.

    :param queries: The texts to search for
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(
        f'Executing `text_search_batched` for search field {search_field}'
    )
    self._validate_search_field(search_field)
    if isinstance(queries[0], BaseDoc):
        query_docs: Sequence[BaseDoc] = cast(Sequence[BaseDoc], queries)
        query_texts: Sequence[str] = self._get_values_by_column(
            query_docs, search_field
        )
    else:
        query_texts = cast(Sequence[str], queries)
    da_list, scores = self._text_search_batched(
        query_texts, search_field=search_field, limit=limit, **kwargs
    )

    if len(da_list) > 0 and isinstance(da_list[0], List):
        docs = [self._dict_list_to_docarray(docs) for docs in da_list]
        return FindResultBatched(documents=docs, scores=scores)

    da_list_ = cast(List[DocList], da_list)
    return FindResultBatched(documents=da_list_, scores=scores)

InMemoryExactNNIndex

docarray.index.backends.in_memory.InMemoryExactNNIndex

index_name property

out_schema: Type[BaseDoc] property

DBConfig dataclass

QueryBuilder

build(*args, **kwargs)

RuntimeConfig dataclass

__contains__(item)

__delitem__(key)

__getitem__(key)

__init__(docs=None, db_config=None, **kwargs)

build_query()

configure(runtime_config=None, **kwargs)

execute_query(query, *args, **kwargs)

filter(filter_query, limit=10, **kwargs)

filter_batched(filter_queries, limit=10, **kwargs)

filter_subindex(filter_query, subindex, limit=10, **kwargs)

find(query, search_field='', limit=10, **kwargs)

find_batched(queries, search_field='', limit=10, **kwargs)

find_subindex(query, subindex='', search_field='', limit=10, **kwargs)

index(docs, **kwargs)

num_docs()

persist(file=None)

python_type_to_db_type(python_type)

subindex_contains(item)

text_search(query, search_field='', limit=10, **kwargs)

text_search_batched(queries, search_field='', limit=10, **kwargs)

`docarray.index.backends.in_memory.InMemoryExactNNIndex`

`index_name` `property`

`out_schema: Type[BaseDoc]` `property`

`DBConfig` `dataclass`

`QueryBuilder`

`build(*args, **kwargs)`

`RuntimeConfig` `dataclass`

`contains(item)`

`delitem(key)`

`getitem(key)`

`init(docs=None, db_config=None, **kwargs)`

`build_query()`

`configure(runtime_config=None, **kwargs)`

`execute_query(query, *args, **kwargs)`

`filter(filter_query, limit=10, **kwargs)`

`filter_batched(filter_queries, limit=10, **kwargs)`

`filter_subindex(filter_query, subindex, limit=10, **kwargs)`

`find(query, search_field='', limit=10, **kwargs)`

`find_batched(queries, search_field='', limit=10, **kwargs)`

`find_subindex(query, subindex='', search_field='', limit=10, **kwargs)`

`index(docs, **kwargs)`

`num_docs()`

`persist(file=None)`

`python_type_to_db_type(python_type)`

`subindex_contains(item)`

`text_search(query, search_field='', limit=10, **kwargs)`

`text_search_batched(queries, search_field='', limit=10, **kwargs)`