EpsillaDocumentIndex

`docarray.index.backends.epsilla.EpsillaDocumentIndex`

Bases: BaseDocIndex, Generic[TSchema]

Source code in docarray/index/backends/epsilla.py

class EpsillaDocumentIndex(BaseDocIndex, Generic[TSchema]):
    def __init__(self, db_config=None, **kwargs):
        """Set up the Epsilla client and make sure the tables can be listed.

        For a self-hosted deployment the database is loaded and selected, and
        the table is created when it is not listed yet. For the cloud
        deployment only the client is created and the tables are listed.

        :param db_config: configuration object forwarded to the base class
        :param kwargs: individual configuration values forwarded to the base class
        :raises IOError: if loading the database or listing the tables fails
        """
        # will set _db_config from args / kwargs
        super().__init__(db_config=db_config, **kwargs)

        config = cast(EpsillaDocumentIndex.DBConfig, self._db_config)
        self._db_config: EpsillaDocumentIndex.DBConfig = config
        config.validate_config()
        self._validate_column_info()

        # fall back to the schema class name when no table name is configured
        self._table_name = config.table_name or self._schema.__name__

        if config.is_self_hosted:
            self._db = vectordb.Client(
                protocol=config.protocol,
                host=config.host,
                port=config.port,
            )
            status_code, response = self._db.load_db(
                db_name=config.db_name,
                db_path=config.db_path,
            )
            if status_code == HTTPStatus.CONFLICT:
                # a CONFLICT answer is treated as "database already loaded"
                self._logger.info(f'{self._db_config.db_name} already loaded.')
            elif status_code != HTTPStatus.OK:
                raise IOError(
                    f"Failed to load database {self._db_config.db_name}. "
                    f"Error code: {status_code}. Error message: {response}."
                )
            self._db.use_db(config.db_name)
        else:
            self._client = cloud.Client(
                project_id=config.cloud_project_id,
                api_key=config.api_key,
            )
            self._db = self._client.vectordb(config.cloud_db_id)

        # both deployment flavours verify that the tables can be listed
        status_code, response = self._db.list_tables()
        if status_code != HTTPStatus.OK:
            raise IOError(
                f"Failed to list tables. "
                f"Error code: {status_code}. Error message: {response}."
            )

        # Epsilla cloud requires table to be created in the web UI before inserting data
        # It does not support creating tables from Python client yet.
        if config.is_self_hosted and self._table_name not in response["result"]:
            self._create_table_self_hosted()

    def _validate_column_info(self):
        """Check that the schema declares exactly one usable embedding column.

        A column counts as an embedding column when its type is a subclass of
        ``list``, ``np.ndarray`` or ``AbstractTensor`` and its config sets
        ``is_embedding``.

        :raises ValueError: if an embedding column has no dimension, if there
            is no embedding column, or if there is more than one
        """
        vector_bases = (list, np.ndarray, AbstractTensor)
        embedding_types = []
        for column in self._column_infos.values():
            is_vector = any(
                safe_issubclass(column.docarray_type, base) for base in vector_bases
            )
            if not (is_vector and column.config.get('is_embedding', False)):
                continue

            # the dimension may come from the type itself or from the column config
            if column.n_dim is None and column.config.get('dim', None) is None:
                raise ValueError("The dimension information is missing")
            embedding_types.append(column.docarray_type)

        if len(embedding_types) == 0:
            raise ValueError(
                "Unable to find any vector columns. Please make sure that at least one "
                "column is of a vector type with the is_embedding=True attribute specified."
            )
        if len(embedding_types) > 1:
            raise ValueError("Specifying multiple vector fields is not supported.")

    def _create_table_self_hosted(self):
        """Create the table in the database from ``_column_infos``.

        Columns with a known dimension are declared with ``dimensions``; all
        other columns carry a ``primaryKey`` flag instead.

        :raises IOError: if the create request is not answered with OK
        """
        id_columns = [
            name
            for name, info in self._column_infos.items()
            if info.docarray_type == ID
        ]

        # when there is a nested schema, we may have multiple "ID" fields. The
        # presence of "__" marks a nested field, so the name with the fewest
        # "__" (first one on ties) is kept as the only primary key.
        if len(id_columns) > 1:
            id_columns = [min(id_columns, key=lambda name: name.count("__"))]

        table_fields = []
        for name, info in self._column_infos.items():
            column_spec = {'name': name, 'dataType': info.db_type}

            dim = info.n_dim
            if dim is None:
                dim = info.config.get('dim', None)

            if dim is None:
                column_spec['primaryKey'] = name in id_columns
            else:
                column_spec['dimensions'] = dim
            table_fields.append(column_spec)

        status_code, response = self._db.create_table(
            table_name=self._table_name,
            table_fields=table_fields,
        )
        if status_code == HTTPStatus.OK:
            return
        raise IOError(
            f"Failed to create table {self._table_name}. "
            f"Error code: {status_code}. Error message: {response}."
        )

    @dataclass
    class Query:
        """Dataclass describing a query.

        Instances are produced by ``QueryBuilder.build`` and consumed by
        ``execute_query``.
        """

        # name of the vector field to search on (passed on as ``search_field``)
        vector_field: Optional[str]
        # query vector; ``None`` means no vector search was requested
        vector_query: Optional[NdArray]
        # filter expression handed to the database; may be ``None``
        filter: Optional[str]
        # maximum number of documents to return
        limit: int

    class QueryBuilder(BaseDocIndex.QueryBuilder):
        """Builder for ``EpsillaDocumentIndex.Query`` objects.

        ``find`` and ``filter`` do not modify the builder they are called on;
        each returns a new builder carrying the accumulated state.
        """

        def __init__(
            self,
            vector_search_field: Optional[str] = None,
            vector_queries: Optional[List[NdArray]] = None,
            filter: Optional[str] = None,
        ):
            """Store the search field, the collected query vectors and the filter.

            :param vector_search_field: field used for vector search, if any
            :param vector_queries: query vectors collected so far
            :param filter: filter expression, if any
            """
            self._vector_search_field: Optional[str] = vector_search_field
            self._vector_queries: List[NdArray] = (
                vector_queries if vector_queries else []
            )
            self._filter: Optional[str] = filter

        def find(self, query: NdArray, search_field: str = ''):
            """Return a new builder with ``query`` added for ``search_field``.

            :param query: query vector to add
            :param search_field: field to search on
            :raises ValueError: if an earlier call used a different search field
            """
            previous_field = self._vector_search_field
            if previous_field and previous_field != search_field:
                raise ValueError(
                    f'Trying to call .find for search_field = {search_field}, but '
                    f'previously {self._vector_search_field} was used. Only a single '
                    f'field might be used in chained calls.'
                )
            extended_queries = self._vector_queries + [query]
            return EpsillaDocumentIndex.QueryBuilder(
                vector_search_field=search_field,
                vector_queries=extended_queries,
                filter=self._filter,
            )

        def filter(self, filter_query: str):  # type: ignore[override]
            """Return a new builder whose filter is ``filter_query``.

            The new filter replaces any filter set before.

            :param filter_query: filter expression
            """
            return EpsillaDocumentIndex.QueryBuilder(
                vector_search_field=self._vector_search_field,
                vector_queries=self._vector_queries,
                filter=filter_query,
            )

        def build(self, limit: int) -> Any:
            """Create the ``Query`` described by this builder.

            :param limit: maximum number of documents to return
            :return: an ``EpsillaDocumentIndex.Query``
            """
            if len(self._vector_queries) == 0:
                vector_query = None
            else:
                # If there are multiple vector queries applied, we can average them
                # and perform semantic search on a single vector instead
                vector_query = np.average(self._vector_queries, axis=0)
            return EpsillaDocumentIndex.Query(
                vector_field=self._vector_search_field,
                vector_query=vector_query,
                filter=self._filter,
                limit=limit,
            )

        find_batched = _raise_not_composable('find_batched')
        filter_batched = _raise_not_composable('filter_batched')
        text_search = _raise_not_supported('text_search')
        text_search_batched = _raise_not_supported('text_search_batched')

    @dataclass
    class DBConfig(BaseDocIndex.DBConfig):
        """Static configuration for EpsillaDocumentIndex"""

        # default value is the schema type name
        table_name: Optional[str] = None

        # Indicator for self-hosted or cloud version
        is_self_hosted: bool = False

        # self-hosted version uses the following configs
        protocol: Optional[str] = None
        host: Optional[str] = None
        port: Optional[int] = 8888
        db_path: Optional[str] = None
        db_name: Optional[str] = None

        # cloud version uses the following configs
        cloud_project_id: Optional[str] = None
        cloud_db_id: Optional[str] = None
        api_key: Optional[str] = None

        # every known column type starts with its own empty config dict
        default_column_config: Dict[Any, Dict[str, Any]] = field(
            default_factory=lambda: {
                db_type: {}
                for db_type in (
                    'TINYINT',
                    'SMALLINT',
                    'INT',
                    'BIGINT',
                    'FLOAT',
                    'DOUBLE',
                    'STRING',
                    'BOOL',
                    'JSON',
                    'VECTOR_FLOAT',
                )
            }
        )

        def validate_config(self):
            """Run the validation matching the selected deployment flavour."""
            validator = (
                self.validate_self_hosted_config
                if self.is_self_hosted
                else self.validate_cloud_config
            )
            validator()

        def validate_self_hosted_config(self):
            """Raise ``ValueError`` if a self-hosted setting is ``None``."""
            required = ("protocol", "host", "port", "db_path", "db_name")
            missing_attributes = [
                name for name in required if getattr(self, name, None) is None
            ]
            if not missing_attributes:
                return
            raise ValueError(
                f"Missing required attributes for self-hosted version: {', '.join(missing_attributes)}"
            )

        def validate_cloud_config(self):
            """Raise ``ValueError`` if a cloud setting is ``None``."""
            required = ("cloud_project_id", "cloud_db_id", "api_key")
            missing_attributes_cloud = [
                name for name in required if getattr(self, name, None) is None
            ]
            if not missing_attributes_cloud:
                return
            raise ValueError(
                f"Missing required attributes for cloud version: {', '.join(missing_attributes_cloud)}"
            )

    @dataclass
    class RuntimeConfig(BaseDocIndex.RuntimeConfig):
        """Runtime configuration; adds nothing to the base runtime config."""

        # No dynamic config used
        pass

    @property
    def collection_name(self):
        """Name of the Epsilla table backing this index.

        Resolved the same way ``__init__`` resolves the table name: the
        configured ``table_name`` when it is set, otherwise the schema class
        name. Previously this returned ``None`` whenever the default table
        name was in use.

        :return: the table name as a string
        """
        # Recomputed from the config rather than read from ``self._table_name``
        # because that attribute is only assigned after the base-class
        # initialiser has run.
        return self._db_config.table_name or self._schema.__name__

    @property
    def index_name(self):
        """Alias for ``collection_name``."""
        return self.collection_name

    def python_type_to_db_type(self, python_type: Type) -> str:
        """Map a Python / docarray type to the name of an Epsilla column type.

        :param python_type: the type of a schema field
        :return: the Epsilla column type name, e.g. ``'STRING'`` or ``'VECTOR_FLOAT'``
        :raises ValueError: if no mapping exists for ``python_type``
        """
        # AbstractTensor does not have n_dims, which is required by Epsilla
        # Use NdArray instead
        vector_types = (list, np.ndarray, AbstractTensor)
        if any(safe_issubclass(python_type, vector_type) for vector_type in vector_types):
            return 'VECTOR_FLOAT'

        # The first matching entry wins, so order matters: ``bool`` is a
        # subclass of ``int`` and has to be listed before it, otherwise boolean
        # columns would be mapped to BIGINT and the BOOL entry never reached.
        py_type_map = {
            ID: 'STRING',
            str: 'STRING',
            bytes: 'STRING',
            bool: 'BOOL',
            int: 'BIGINT',
            float: 'FLOAT',
        }

        for py_type, epsilla_type in py_type_map.items():
            if safe_issubclass(python_type, py_type):
                return epsilla_type

        raise ValueError(f'Unsupported column type for {type(self)}: {python_type}')

    def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]):
        """Insert the given column data as rows into the table.

        Sub-index data is indexed first; array values are converted to plain
        lists before the rows are sent to the database.

        :param column_to_data: mapping from column name to a generator of values
        :raises IOError: if the insert request is not answered with OK
        """
        self._index_subindex(column_to_data)

        def to_plain(value):
            # arrays are sent as plain Python lists
            if isinstance(value, (NdArray, np.ndarray)):
                return value.tolist()
            return value

        records = [
            {column: to_plain(value) for column, value in row.items()}
            for row in self._transpose_col_value_dict(column_to_data)
        ]

        status_code, response = self._db.insert(
            table_name=self._table_name, records=records
        )
        if status_code == HTTPStatus.OK:
            return
        raise IOError(
            f"Failed to insert documents. "
            f"Error code: {status_code}. Error message: {response}."
        )

    def num_docs(self) -> int:
        """Not implemented for this backend.

        A comment elsewhere in this class notes that Epsilla does not have a
        count API for the number of documents.

        :raises NotImplementedError: on every call
        """
        raise NotImplementedError

    @property
    def _is_index_empty(self) -> bool:
        """
        Report whether the index is empty; this backend always answers False.

        No document count is consulted here.
        :return: always False
        """
        # Overriding this method to always return False because Epsilla does not have a count API for num_docs
        return False

    def _del_items(self, doc_ids: Sequence[str]):
        """Delete the documents with the given ids from the table.

        :param doc_ids: primary keys of the documents to delete
        :return: the ``message`` entry of the database response
        :raises IOError: if the delete request is not answered with OK
        """
        status_code, response = self._db.delete(
            table_name=self._table_name,
            primary_keys=list(doc_ids),
        )
        if status_code != HTTPStatus.OK:
            # the message used to say "get", copied from _get_items
            raise IOError(
                f"Failed to delete documents with ids {doc_ids}. "
                f"Error code: {status_code}. Error message: {response}."
            )
        return response['message']

    def _get_items(
        self, doc_ids: Sequence[str]
    ) -> Union[Sequence[TSchema], Sequence[Dict[str, Any]]]:
        """Fetch the documents with the given ids from the table.

        :param doc_ids: primary keys of the documents to fetch
        :return: the ``result`` entry of the database response
        :raises IOError: if the get request is not answered with OK
        """
        requested_keys = list(doc_ids)
        status_code, response = self._db.get(
            table_name=self._table_name,
            primary_keys=requested_keys,
        )
        if status_code == HTTPStatus.OK:
            return response['result']
        raise IOError(
            f"Failed to get documents with ids {doc_ids}. "
            f"Error code: {status_code}. Error message: {response}."
        )

    def execute_query(self, query: Query) -> DocList:
        """Run a query produced by the query builder.

        Without a query vector only the filter is applied; otherwise a vector
        search (with the optional filter) is performed. Scores of the vector
        search are not part of the returned value.

        :param query: the query to execute
        :return: the matching documents
        """
        if query.vector_query is None:
            filtered = self._filter(
                filter_query=query.filter,
                limit=query.limit,
            )
            return self._dict_list_to_docarray(filtered)

        # the search helper works on batches, so wrap the vector in a batch of one
        single_batch = np.expand_dims(query.vector_query, axis=0)
        result = self._find_with_filter_batched(
            queries=single_batch,
            filter=query.filter,
            limit=query.limit,
            search_field=query.vector_field,
        )
        return self._dict_list_to_docarray(result.documents[0])

    def _doc_exists(self, doc_id: str) -> bool:
        """Return True if fetching ``doc_id`` yields at least one item.

        :param doc_id: id of the document to look up
        """
        found = self._get_items([doc_id])
        return len(found) != 0

    def _find(
        self,
        query: np.ndarray,
        limit: int,
        search_field: str = '',
    ) -> _FindResult:
        """Vector search for a single query, delegated to ``_find_batched``.

        :param query: the query vector
        :param limit: maximum number of documents to return
        :param search_field: name of the field to search on
        :return: documents and scores for the query
        """
        # run as a batch of one and unwrap the single result afterwards
        batch_documents, batch_scores = self._find_batched(
            queries=np.expand_dims(query, axis=0),
            limit=limit,
            search_field=search_field,
        )
        return _FindResult(documents=batch_documents[0], scores=batch_scores[0])

    def _find_batched(
        self,
        queries: np.ndarray,
        limit: int,
        search_field: str = '',
    ) -> _FindResultBatched:
        """Vector search for several queries without any filter.

        :param queries: the query vectors, iterated one query at a time
        :param limit: maximum number of documents to return per query
        :param search_field: name of the field to search on
        :return: documents and scores, one entry per query
        """
        unfiltered_result = self._find_with_filter_batched(
            queries=queries,
            limit=limit,
            search_field=search_field,
        )
        return unfiltered_result

    def _find_with_filter_batched(
        self,
        queries: np.ndarray,
        limit: int,
        search_field: str,
        filter: Optional[str] = None,
    ) -> _FindResultBatched:
        """Vector search for several queries, optionally restricted by a filter.

        One database query is issued per query vector. The ``@distance`` value
        of every hit becomes its score and is removed from the returned
        document.

        :param queries: the query vectors, iterated one query at a time
        :param limit: maximum number of documents to return per query
        :param search_field: name of the field to search on; must not be empty
        :param filter: filter expression; ``None`` is sent as an empty string
        :return: documents and scores, one entry per query
        :raises ValueError: if ``search_field`` is the empty string
        :raises IOError: if a query request is not answered with OK
        """
        if search_field == '':
            raise ValueError(
                'EpsillaDocumentIndex requires a search_field to be specified.'
            )

        filter_expression = '' if filter is None else filter
        documents_per_query = []
        scores_per_query = []
        for query in queries:
            status_code, response = self._db.query(
                table_name=self._table_name,
                query_field=search_field,
                limit=limit,
                filter=filter_expression,
                query_vector=query.tolist(),
                with_distance=True,
            )
            if status_code != HTTPStatus.OK:
                raise IOError(
                    f"Failed to find documents with query {query}. "
                    f"Error code: {status_code}. Error message: {response}."
                )

            hits = response['result']
            distances = np.array([hit['@distance'] for hit in hits])
            scores_per_query.append(NdArray._docarray_from_native(distances))

            documents = []
            for hit in hits:
                # shallow copy so the response entry itself is left untouched
                document = copy.copy(hit)
                document.pop("@distance")
                documents.append(document)
            documents_per_query.append(documents)

        return _FindResultBatched(
            documents=documents_per_query,
            scores=scores_per_query,
        )

    def _filter(
        self,
        filter_query: str,
        limit: int,
    ) -> Union[DocList, List[Dict]]:
        """Apply a single filter query, delegated to ``_filter_batched``.

        :param filter_query: the filter expression
        :param limit: maximum number of documents to return
        :return: the documents matching the filter
        """
        # run as a batch of one and unwrap the single result afterwards
        batched_results = self._filter_batched(
            filter_queries=[filter_query], limit=limit
        )
        return batched_results[0]

    def _filter_batched(
        self,
        filter_queries: Sequence[str],
        limit: int,
    ) -> Union[List[DocList], List[List[Dict]]]:
        """Apply several filter queries, one database request per query.

        :param filter_queries: the filter expressions; a single string is
            treated as one query
        :param limit: maximum number of documents to return per query
        :return: one list of matching documents per filter query
        :raises IOError: if a get request is not answered with OK
        """
        # The parameter used to be annotated as ``str``; iterating a bare string
        # would issue one request per character, so wrap it into a batch of one.
        if isinstance(filter_queries, str):
            filter_queries = [filter_queries]

        responses = []
        for filter_query in filter_queries:
            status_code, response = self._db.get(
                table_name=self._table_name,
                limit=limit,
                filter=filter_query,
            )

            if status_code != HTTPStatus.OK:
                raise IOError(
                    f"Failed to find documents with filter {filter_query}. "
                    f"Error code: {status_code}. Error message: {response}."
                )

            responses.append(response['result'])

        return responses

    def _text_search(
        self,
        query: str,
        limit: int,
        search_field: str = '',
    ) -> _FindResult:
        """Text search is not available for this backend.

        :raises NotImplementedError: on every call
        """
        message = f'{type(self)} does not support text search.'
        raise NotImplementedError(message)

    def _text_search_batched(
        self,
        queries: Sequence[str],
        limit: int,
        search_field: str = '',
    ) -> _FindResultBatched:
        """Batched text search is not available for this backend.

        :raises NotImplementedError: on every call
        """
        message = f'{type(self)} does not support text search.'
        raise NotImplementedError(message)

`DBConfig` `dataclass`

Bases: DBConfig

Static configuration for EpsillaDocumentIndex

Source code in docarray/index/backends/epsilla.py

@dataclass
class DBConfig(BaseDocIndex.DBConfig):
    """Static configuration for EpsillaDocumentIndex"""

    # default value is the schema type name
    table_name: Optional[str] = None

    # Indicator for self-hosted or cloud version
    is_self_hosted: bool = False

    # self-hosted version uses the following configs
    protocol: Optional[str] = None
    host: Optional[str] = None
    port: Optional[int] = 8888
    db_path: Optional[str] = None
    db_name: Optional[str] = None

    # cloud version uses the following configs
    cloud_project_id: Optional[str] = None
    cloud_db_id: Optional[str] = None
    api_key: Optional[str] = None

    # every known column type starts with its own empty config dict
    default_column_config: Dict[Any, Dict[str, Any]] = field(
        default_factory=lambda: {
            db_type: {}
            for db_type in (
                'TINYINT',
                'SMALLINT',
                'INT',
                'BIGINT',
                'FLOAT',
                'DOUBLE',
                'STRING',
                'BOOL',
                'JSON',
                'VECTOR_FLOAT',
            )
        }
    )

    def validate_config(self):
        """Run the validation matching the selected deployment flavour."""
        validator = (
            self.validate_self_hosted_config
            if self.is_self_hosted
            else self.validate_cloud_config
        )
        validator()

    def validate_self_hosted_config(self):
        """Raise ``ValueError`` if a self-hosted setting is ``None``."""
        required = ("protocol", "host", "port", "db_path", "db_name")
        missing_attributes = [
            name for name in required if getattr(self, name, None) is None
        ]
        if not missing_attributes:
            return
        raise ValueError(
            f"Missing required attributes for self-hosted version: {', '.join(missing_attributes)}"
        )

    def validate_cloud_config(self):
        """Raise ``ValueError`` if a cloud setting is ``None``."""
        required = ("cloud_project_id", "cloud_db_id", "api_key")
        missing_attributes_cloud = [
            name for name in required if getattr(self, name, None) is None
        ]
        if not missing_attributes_cloud:
            return
        raise ValueError(
            f"Missing required attributes for cloud version: {', '.join(missing_attributes_cloud)}"
        )

`Query` `dataclass`

Dataclass describing a query.

Source code in docarray/index/backends/epsilla.py

@dataclass
class Query:
    """Dataclass describing a query.

    Instances are produced by ``QueryBuilder.build`` and consumed by
    ``execute_query``.
    """

    # name of the vector field to search on (passed on as ``search_field``)
    vector_field: Optional[str]
    # query vector; ``None`` means no vector search was requested
    vector_query: Optional[NdArray]
    # filter expression handed to the database; may be ``None``
    filter: Optional[str]
    # maximum number of documents to return
    limit: int

`__contains__(item)`

Checks if a given document exists in the index.

Parameters:

Name	Type	Description	Default
`item`	`BaseDoc`	The document to check. It must be an instance of BaseDoc or its subclass.	required

Returns:

Type	Description
`bool`	True if the document exists in the index, False otherwise.

Source code in docarray/index/abstract.py

def __contains__(self, item: BaseDoc) -> bool:
    """
    Checks if a given document exists in the index.

    The lookup is done by the string form of the document's ``id``.

    :param item: The document to check.
        It must be an instance of BaseDoc or its subclass.
    :return: True if the document exists in the index, False otherwise.
    :raises TypeError: if ``item`` is not a BaseDoc (or subclass) instance
    """
    if not safe_issubclass(type(item), BaseDoc):
        raise TypeError(
            f"item must be an instance of BaseDoc or its subclass, not '{type(item).__name__}'"
        )
    return self._doc_exists(str(item.id))

`__delitem__(key)`

Delete one or multiple Documents from the index, by id. If no document is found, a KeyError is raised.

Parameters:

Name	Type	Description	Default
`key`	`Union[str, Sequence[str]]`	id or ids to delete from the Document index	required

Source code in docarray/index/abstract.py

def __delitem__(self, key: Union[str, Sequence[str]]):
    """Delete one or multiple Documents from the index, by `id`.

    Nested documents stored in sub-indices are deleted first, then the
    documents themselves.

    NOTE(review): this function does not raise a KeyError itself for unknown
    ids; whether one is raised depends on `_del_items` - confirm per backend.

    :param key: id or ids to delete from the Document index
    """
    self._logger.info(f'Deleting documents with id(s) {key} from the index')
    doc_ids = [key] if isinstance(key, str) else key

    # delete nested data
    schema = cast(Type[BaseDoc], self._schema)
    for field_name, type_, _ in self._flatten_schema(schema):
        if not safe_issubclass(type_, AnyDocArray):
            continue
        for doc_id in doc_ids:
            subindex = self._subindices[field_name]
            nested_docs_id = subindex._filter_by_parent_id(doc_id)
            if nested_docs_id:
                del subindex[nested_docs_id]

    # delete data
    self._del_items(doc_ids)

`__getitem__(key)`

Get one or multiple Documents from the index, by id. If no document is found, a KeyError is raised.

Parameters:

Name	Type	Description	Default
`key`	`Union[str, Sequence[str]]`	id or ids to get from the Document index	required

Source code in docarray/index/abstract.py

def __getitem__(
    self, key: Union[str, Sequence[str]]
) -> Union[TSchema, DocList[TSchema]]:
    """Get one or multiple Documents from the index, by `id`.
    If no document is found, a KeyError is raised.

    :param key: id or ids to get from the Document index
    :return: a single Document when `key` is a string, otherwise a DocList
    """
    # normalize input
    return_singleton = isinstance(key, str)
    if return_singleton:
        key = [key]

    # retrieve data
    doc_sequence = self._get_items(key)

    # check data
    if len(doc_sequence) == 0:
        raise KeyError(f'No document with id {key} found')

    # retrieve nested data
    schema = cast(Type[BaseDoc], self._schema)
    for field_name, type_, _ in self._flatten_schema(schema):
        if not safe_issubclass(type_, AnyDocArray):
            continue
        if not isinstance(doc_sequence[0], Dict):
            continue
        for doc in doc_sequence:
            self._get_subindex_doclist(doc, field_name)  # type: ignore

    # cast output
    if isinstance(doc_sequence, DocList):
        out_docs: DocList[TSchema] = doc_sequence
    elif isinstance(doc_sequence[0], Dict):
        out_docs = self._dict_list_to_docarray(doc_sequence)  # type: ignore
    else:
        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))
        out_docs = docs_cls(doc_sequence)

    if return_singleton:
        return out_docs[0]
    return out_docs

`build_query()`

Build a query for this DocumentIndex.

Returns:

Type	Description
`QueryBuilder`	a new `QueryBuilder` object for this DocumentIndex

Source code in docarray/index/abstract.py

def build_query(self) -> QueryBuilder:
    """
    Create an empty query builder for this DocumentIndex.

    :return: a new `QueryBuilder` object for this DocumentIndex
    """
    builder = self.QueryBuilder()  # type: ignore
    return builder

`configure(runtime_config=None, **kwargs)`

Configure the DocumentIndex. You can either pass a config object to config or pass individual config parameters as keyword arguments. If a configuration object is passed, it will replace the current configuration. If keyword arguments are passed, they will update the current configuration.

Parameters:

Name	Type	Description	Default
`runtime_config`		the configuration to apply	`None`
`kwargs`		individual configuration parameters	`{}`

Source code in docarray/index/abstract.py

def configure(self, runtime_config=None, **kwargs):
    """
    Configure the DocumentIndex.
    Either pass a complete config object as `runtime_config`, or pass individual
    config parameters as keyword arguments.
    A passed configuration object replaces the current configuration; the
    keyword arguments are not applied in that case.
    Without a configuration object, the keyword arguments update the current
    configuration.

    :param runtime_config: the configuration to apply
    :param kwargs: individual configuration parameters
    :raises ValueError: if `runtime_config` is not a `RuntimeConfig` instance
    """
    if runtime_config is not None:
        if not isinstance(runtime_config, self.RuntimeConfig):
            raise ValueError(f'runtime_config must be of type {self.RuntimeConfig}')
        self._runtime_config = runtime_config
        return

    # no object given: derive a new config with the given fields overridden
    self._runtime_config = replace(self._runtime_config, **kwargs)

`filter(filter_query, limit=10, **kwargs)`

Find documents in the index based on a filter query

Parameters:

Name	Type	Description	Default
`filter_query`	`Any`	the DB specific filter query to execute	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`DocList`	a DocList containing the documents that match the filter query

Source code in docarray/index/abstract.py

def filter(
    self,
    filter_query: Any,
    limit: int = 10,
    **kwargs,
) -> DocList:
    """Find documents in the index based on a filter query

    A plain list returned by the backend is converted to a DocList; any other
    result is returned unchanged.

    :param filter_query: the DB specific filter query to execute
    :param limit: maximum number of documents to return
    :return: a DocList containing the documents that match the filter query
    """
    self._logger.debug(f'Executing `filter` for the query {filter_query}')
    matches = self._filter(filter_query, limit=limit, **kwargs)

    needs_conversion = isinstance(matches, List) and not isinstance(matches, DocList)
    if needs_conversion:
        return self._dict_list_to_docarray(matches)
    return matches

`filter_batched(filter_queries, limit=10, **kwargs)`

Find documents in the index based on multiple filter queries.

Parameters:

Name	Type	Description	Default
`filter_queries`	`Any`	the DB specific filter query to execute	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`List[DocList]`	a list of DocLists, one per filter query, each containing the documents that match that query

Source code in docarray/index/abstract.py

def filter_batched(
    self,
    filter_queries: Any,
    limit: int = 10,
    **kwargs,
) -> List[DocList]:
    """Find documents in the index based on multiple filter queries.

    When the first per-query result is a list, every per-query result is
    converted to a DocList; otherwise the backend result is returned as is.

    :param filter_queries: the DB specific filter queries to execute
    :param limit: maximum number of documents to return
    :return: one result per filter query with the documents matching it
    """
    self._logger.debug(
        f'Executing `filter_batched` for the queries {filter_queries}'
    )
    per_query = self._filter_batched(filter_queries, limit=limit, **kwargs)

    if len(per_query) == 0 or not isinstance(per_query[0], List):
        return per_query  # type: ignore

    return [self._dict_list_to_docarray(docs) for docs in per_query]  # type: ignore

`filter_subindex(filter_query, subindex, limit=10, **kwargs)`

Find documents in subindex level based on a filter query

Parameters:

Name	Type	Description	Default
`filter_query`	`Any`	the DB specific filter query to execute	required
`subindex`	`str`	name of the subindex to search on	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`DocList`	a DocList containing the subindex level documents that match the filter query

Source code in docarray/index/abstract.py

def filter_subindex(
    self,
    filter_query: Any,
    subindex: str,
    limit: int = 10,
    **kwargs,
) -> DocList:
    """Find documents in subindex level based on a filter query

    A `subindex` containing `__` addresses a nested subindex: the part before
    the first `__` selects the direct subindex and the remainder is resolved
    by that subindex.

    :param filter_query: the DB specific filter query to execute
    :param subindex: name of the subindex to search on
    :param limit: maximum number of documents to return
    :return: a DocList containing the subindex level documents that match the filter query
    """
    self._logger.debug(
        f'Executing `filter` for the query {filter_query} in subindex {subindex}'
    )
    head, separator, remainder = subindex.partition('__')
    if not separator:
        # plain name: filter directly on that subindex
        return self._subindices[subindex].filter(
            filter_query, limit=limit, **kwargs
        )

    # nested name: let the first-level subindex resolve the rest of the path
    return self._subindices[head].filter_subindex(
        filter_query, remainder, limit=limit, **kwargs
    )

`find(query, search_field='', limit=10, **kwargs)`

Find documents in the index using nearest neighbor search.

Parameters:

Name	Type	Description	Default
`query`	`Union[AnyTensor, BaseDoc]`	query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a single axis, or a Document	required
`search_field`	`str`	name of the field to search on. Documents in the index are retrieved based on this similarity of this field to the query.	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`FindResult`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def find(
    self,
    query: Union[AnyTensor, BaseDoc],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResult:
    """Find documents in the index using nearest neighbor search.

    :param query: query vector for KNN/ANN search.
        Can be either a tensor-like (np.array, torch.Tensor, etc.)
        with a single axis, or a Document
    :param search_field: name of the field to search on.
        Documents in the index are retrieved based on the similarity
        of this field to the query.
    :param limit: maximum number of documents to return
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(f'Executing `find` for search field {search_field}')

    self._validate_search_field(search_field)

    # a Document query contributes the value of its search field as the vector
    query_vec = (
        self._get_values_by_column([query], search_field)[0]
        if isinstance(query, BaseDoc)
        else query
    )
    docs, scores = self._find(
        self._to_numpy(query_vec), search_field=search_field, limit=limit, **kwargs
    )

    # plain lists from the backend are converted to a DocList
    if isinstance(docs, List) and not isinstance(docs, DocList):
        docs = self._dict_list_to_docarray(docs)

    return FindResult(documents=docs, scores=scores)

`find_batched(queries, search_field='', limit=10, **kwargs)`

Find documents in the index using nearest neighbor search.

Parameters:

Name	Type	Description	Default
`queries`	`Union[AnyTensor, DocList]`	query vectors for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) or a DocList. If a tensor-like is passed, it should have shape (batch_size, vector_dim)	required
`search_field`	`str`	name of the field to search on. Documents in the index are retrieved based on this similarity of this field to the query.	`''`
`limit`	`int`	maximum number of documents to return per query	`10`

Returns:

Type	Description
`FindResultBatched`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def find_batched(
    self,
    queries: Union[AnyTensor, DocList],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResultBatched:
    """Run a nearest neighbor search for a batch of queries.

    :param queries: queries for the KNN/ANN search.
        Either a tensor-like (np.array, torch.Tensor, etc.) or a DocList.
        If a tensor-like is passed, it should have shape (batch_size, vector_dim)
    :param search_field: name of the field to search on.
        Documents in the index are retrieved based on the similarity
        of this field to the query.
    :param limit: maximum number of documents to return per query
    :param kwargs: forwarded unchanged to `_find_batched` (or to the
        subindex's `find_batched` when the search is delegated)
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(f'Executing `find_batched` for search field {search_field}')

    # A `head__rest` search field whose head is annotated as a document array
    # is answered by the subindex registered under `head`, searching `rest`.
    if search_field and '__' in search_field:
        head, *rest = search_field.split('__')
        if safe_issubclass(self._schema._get_field_annotation(head), AnyDocArray):  # type: ignore
            return self._subindices[head].find_batched(
                queries,
                search_field='__'.join(rest),
                limit=limit,
                **kwargs,
            )

    self._validate_search_field(search_field)
    if isinstance(queries, Sequence):
        # Sequence input: pull the searched column and stack it into one array.
        column_values = self._get_values_by_column(queries, search_field)
        batch_np = np.stack(tuple(map(self._to_numpy, column_values)))
    else:
        batch_np = self._to_numpy(queries)

    per_query_docs, scores = self._find_batched(
        batch_np, search_field=search_field, limit=limit, **kwargs
    )

    # Only the first entry is inspected: plain lists are converted with
    # `_dict_list_to_docarray`, DocLists are returned untouched.
    holds_plain_lists = (
        len(per_query_docs) > 0
        and isinstance(per_query_docs[0], List)
        and not isinstance(per_query_docs[0], DocList)
    )
    if holds_plain_lists:
        per_query_docs = [
            self._dict_list_to_docarray(hits) for hits in per_query_docs
        ]

    return FindResultBatched(documents=per_query_docs, scores=scores)  # type: ignore

`find_subindex(query, subindex='', search_field='', limit=10, **kwargs)`

Find documents in subindex level.

Parameters:

Name	Type	Description	Default
`query`	`Union[AnyTensor, BaseDoc]`	query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a single axis, or a Document	required
`subindex`	`str`	name of the subindex to search on	`''`
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`SubindexFindResult`	a named tuple containing root docs, subindex docs and scores

Source code in docarray/index/abstract.py

def find_subindex(
    self,
    query: Union[AnyTensor, BaseDoc],
    subindex: str = '',
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> SubindexFindResult:
    """Search at subindex level and return the hits with their root documents.

    :param query: query vector for KNN/ANN search.
        Can be either a tensor-like (np.array, torch.Tensor, etc.)
        with a single axis, or a Document
    :param subindex: name of the subindex to search on; nested levels are
        separated by `__`
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :param kwargs: forwarded unchanged to `_find_subdocs`
    :return: a named tuple containing root docs, subindex docs and scores
    """
    self._logger.debug(f'Executing `find_subindex` for search field {search_field}')

    sub_docs, scores = self._find_subdocs(
        query, subindex=subindex, search_field=search_field, limit=limit, **kwargs
    )

    # Split the subindex path into its first level and the remaining path.
    top_level, *deeper_levels = subindex.split('__')
    remaining_path = '__'.join(deeper_levels)

    # Resolve every root id first, then fetch the root documents by id.
    root_ids = [
        self._get_root_doc_id(sub_doc.id, top_level, remaining_path)
        for sub_doc in sub_docs
    ]
    root_docs = DocList[self._schema]()  # type: ignore
    for root_id in root_ids:
        root_docs.append(self[root_id])

    return SubindexFindResult(
        root_documents=root_docs, sub_documents=sub_docs, scores=scores  # type: ignore
    )

`index(docs, **kwargs)`

Index Documents into the index.

Note

Passing a sequence of Documents that is not a DocList (such as a List of Docs) comes at a performance penalty. This is because the Index needs to check compatibility between itself and the data. With a DocList as input this is a single check; for other inputs compatibility needs to be checked for every Document individually.

Parameters:

Name	Type	Description	Default
`docs`	`Union[BaseDoc, Sequence[BaseDoc]]`	Documents to index.	required

Source code in docarray/index/abstract.py

def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
    """Index Documents into the index.

    !!! note
        Passing a sequence of Documents that is not a DocList
        (such as a List of Docs) comes at a performance penalty.
        This is because the Index needs to check compatibility between itself and
        the data. With a DocList as input this is a single check; for other inputs
        compatibility needs to be checked for every Document individually.

    :param docs: Documents to index; a single Document or a sequence of them.
    :param kwargs: forwarded unchanged to `_index`
    """
    # A single Document counts as one; anything else is measured with len().
    if isinstance(docs, BaseDoc):
        doc_count = 1
    else:
        doc_count = len(docs)
    self._logger.debug(f'Indexing {doc_count} documents')

    validated = self._validate_docs(docs)
    self._update_subindex_data(validated)
    column_data = self._get_col_value_dict(validated)
    self._index(column_data, **kwargs)

`subindex_contains(item)`

Checks if a given BaseDoc item is contained in the index or any of its subindices.

Parameters:

Name	Type	Description	Default
`item`	`BaseDoc`	the given BaseDoc	required

Returns:

Type	Description
`bool`	if the given BaseDoc item is contained in the index/subindices

Source code in docarray/index/abstract.py

def subindex_contains(self, item: BaseDoc) -> bool:
    """Check whether a BaseDoc is contained in the index or any of its subindices.

    The emptiness check runs before the type check, so an empty index returns
    `False` without validating `item`.

    :param item: the given BaseDoc
    :return: whether the given BaseDoc item is contained in the index/subindices
    :raises TypeError: if the index is not empty and `item` is not an
        instance of BaseDoc or one of its subclasses
    """
    if self._is_index_empty:
        return False

    if not safe_issubclass(type(item), BaseDoc):
        raise TypeError(
            f"item must be an instance of BaseDoc or its subclass, not '{type(item).__name__}'"
        )

    # Subindices are consulted lazily, only if this index does not hold the item.
    return self.__contains__(item) or any(
        sub.subindex_contains(item) for sub in self._subindices.values()
    )

`text_search(query, search_field='', limit=10, **kwargs)`

Find documents in the index based on a text search query.

Parameters:

Name	Type	Description	Default
`query`	`Union[str, BaseDoc]`	The text to search for	required
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`FindResult`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def text_search(
    self,
    query: Union[str, BaseDoc],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResult:
    """Run a text search for a single query.

    :param query: The text to search for; for a Document, the value stored
        under `search_field` is used as the query text
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :param kwargs: forwarded unchanged to `_text_search`
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(f'Executing `text_search` for search field {search_field}')
    self._validate_search_field(search_field)

    # A Document query contributes only the value of the searched field.
    text = (
        self._get_values_by_column([query], search_field)[0]
        if isinstance(query, BaseDoc)
        else query
    )
    found, scores = self._text_search(
        text, search_field=search_field, limit=limit, **kwargs
    )

    # A plain list result (anything that is not already a DocList) is
    # converted with `_dict_list_to_docarray`; DocLists pass through as-is.
    is_plain_list = isinstance(found, List) and not isinstance(found, DocList)
    if is_plain_list:
        found = self._dict_list_to_docarray(found)

    return FindResult(documents=found, scores=scores)

`text_search_batched(queries, search_field='', limit=10, **kwargs)`

Find documents in the index based on a text search query.

Parameters:

Name	Type	Description	Default
`queries`	`Union[Sequence[str], Sequence[BaseDoc]]`	The texts to search for	required
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`FindResultBatched`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def text_search_batched(
    self,
    queries: Union[Sequence[str], Sequence[BaseDoc]],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResultBatched:
    """Find documents in the index based on a batch of text search queries.

    Only the first element of `queries` is inspected to decide whether the
    batch holds Documents or plain strings, so `queries` must be non-empty
    (an empty sequence raises `IndexError` on that lookup) and homogeneous.

    :param queries: The texts to search for, either as strings or as
        Documents whose `search_field` value is used as the query text
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :param kwargs: forwarded unchanged to `_text_search_batched`
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(
        f'Executing `text_search_batched` for search field {search_field}'
    )
    self._validate_search_field(search_field)
    if isinstance(queries[0], BaseDoc):
        query_docs: Sequence[BaseDoc] = cast(Sequence[BaseDoc], queries)
        query_texts: Sequence[str] = self._get_values_by_column(
            query_docs, search_field
        )
    else:
        query_texts = cast(Sequence[str], queries)
    da_list, scores = self._text_search_batched(
        query_texts, search_field=search_field, limit=limit, **kwargs
    )

    # Convert only plain lists. Results that are already DocLists must not be
    # passed through `_dict_list_to_docarray` again; this mirrors the guard
    # used by `find`, `find_batched` and `text_search`.
    if (
        len(da_list) > 0
        and isinstance(da_list[0], List)
        and not isinstance(da_list[0], DocList)
    ):
        converted = [self._dict_list_to_docarray(hits) for hits in da_list]
        return FindResultBatched(documents=converted, scores=scores)

    da_list_ = cast(List[DocList], da_list)
    return FindResultBatched(documents=da_list_, scores=scores)

EpsillaDocumentIndex

docarray.index.backends.epsilla.EpsillaDocumentIndex

DBConfig dataclass

Query dataclass

__contains__(item)

__delitem__(key)

__getitem__(key)

build_query()

configure(runtime_config=None, **kwargs)

filter(filter_query, limit=10, **kwargs)

filter_batched(filter_queries, limit=10, **kwargs)

filter_subindex(filter_query, subindex, limit=10, **kwargs)

find(query, search_field='', limit=10, **kwargs)

find_batched(queries, search_field='', limit=10, **kwargs)

find_subindex(query, subindex='', search_field='', limit=10, **kwargs)

index(docs, **kwargs)

subindex_contains(item)

text_search(query, search_field='', limit=10, **kwargs)

text_search_batched(queries, search_field='', limit=10, **kwargs)

`docarray.index.backends.epsilla.EpsillaDocumentIndex`

`DBConfig` `dataclass`

`Query` `dataclass`

`__contains__(item)`

`__delitem__(key)`

`__getitem__(key)`

`build_query()`

`configure(runtime_config=None, **kwargs)`

`filter(filter_query, limit=10, **kwargs)`

`filter_batched(filter_queries, limit=10, **kwargs)`

`filter_subindex(filter_query, subindex, limit=10, **kwargs)`

`find(query, search_field='', limit=10, **kwargs)`

`find_batched(queries, search_field='', limit=10, **kwargs)`

`find_subindex(query, subindex='', search_field='', limit=10, **kwargs)`

`index(docs, **kwargs)`

`subindex_contains(item)`

`text_search(query, search_field='', limit=10, **kwargs)`

`text_search_batched(queries, search_field='', limit=10, **kwargs)`