WeaviateDocumentIndex

`docarray.index.backends.weaviate.WeaviateDocumentIndex`

Bases: BaseDocIndex, Generic[TSchema]

Source code in docarray/index/backends/weaviate.py

class WeaviateDocumentIndex(BaseDocIndex, Generic[TSchema]):
    def __init__(self, db_config=None, **kwargs) -> None:
        """Initialize WeaviateDocumentIndex.

        Prepares the per-instance bookkeeping attributes, runs the base class
        initializer, creates the Weaviate client and then configures the client,
        validates the columns, records the embedding column and the queryable
        properties, and creates the schema.

        :param db_config: configuration object forwarded to the base class
            initializer
        :param kwargs: additional keyword arguments forwarded to the base class
            initializer
        """

        # these attributes are assigned before the base class initializer runs
        self.embedding_column: Optional[str] = None
        self.properties: Optional[List[str]] = None
        # keep track of the column name that contains the bytes
        # type because we will store them as a base64 encoded string
        # in weaviate
        self.bytes_columns: List[str] = []
        # keep track of the array columns that are not embeddings because we will
        # convert them to python lists before uploading to weaviate
        self.nonembedding_array_columns: List[str] = []
        super().__init__(db_config=db_config, **kwargs)
        # the casts only narrow the static types of the two config objects
        self._db_config: WeaviateDocumentIndex.DBConfig = cast(
            WeaviateDocumentIndex.DBConfig, self._db_config
        )
        self._runtime_config: WeaviateDocumentIndex.RuntimeConfig = cast(
            WeaviateDocumentIndex.RuntimeConfig, self._runtime_config
        )

        # when embedded options are configured they are used instead of `host`
        if self._db_config.embedded_options:
            self._client = weaviate.Client(
                embedded_options=self._db_config.embedded_options
            )
        else:
            self._client = weaviate.Client(
                self._db_config.host, auth_client_secret=self._build_auth_credentials()
            )

        # `_create_schema` reads `self.embedding_column`, so the embedding
        # column has to be set before the schema is created
        self._configure_client()
        self._validate_columns()
        self._set_embedding_column()
        self._set_properties()
        self._create_schema()

    @property
    def index_name(self):
        """Name of the Weaviate class that backs this index.

        :return: the index name set in the DB config if there is one, otherwise
            the name of the schema (Document) class
        :raises ValueError: if the index was not typed with a Document type
        """
        if self._schema is None:
            # the two sentences are separate literals; the trailing space keeps
            # them from running together in the rendered message
            raise ValueError(
                'A WeaviateDocumentIndex must be typed with a Document type. '
                'To do so, use the syntax: WeaviateDocumentIndex[DocumentType]'
            )

        default_index_name = self._schema.__name__
        return self._db_config.index_name or default_index_name

    def _set_properties(self) -> None:
        """Collect the names of the columns that are queried as properties.

        Columns flagged as embedding and columns holding nested document
        arrays are left out; the ``id`` column is listed under ``DOCUMENTID``.
        """
        selected = []
        for column_name, column_info in self._column_infos.items():
            # only an explicit (or defaulted) `False` flag counts as non-embedding
            if column_info.config.get('is_embedding', False) is not False:
                continue
            if safe_issubclass(column_info.docarray_type, AnyDocArray):
                continue
            selected.append(DOCUMENTID if column_name == "id" else column_name)

        self.properties = selected

    def _validate_columns(self) -> None:
        # must have at most one column with property is_embedding=True
        # and that column must be of type WEAVIATE_PY_VEC_TYPES
        # TODO: update when https://github.com/weaviate/weaviate/issues/2424
        # is implemented and discuss best interface to signal which column(s)
        # should be used for embeddings
        num_embedding_columns = 0

        for column_name, column_info in self._column_infos.items():
            if column_info.config.get('is_embedding', False):
                num_embedding_columns += 1
                # if db_type is not 'number[]', then that means the type of the column in
                # the given schema is not one of WEAVIATE_PY_VEC_TYPES
                # note: the mapping between a column's type in the schema to a weaviate type
                # is handled by the python_type_to_db_type method
                if column_info.db_type != 'number[]':
                    raise ValueError(
                        f'Column {column_name} is marked as embedding but is not of type {WEAVIATE_PY_VEC_TYPES}'
                    )

        if num_embedding_columns > 1:
            raise ValueError(
                f'Only one column can be marked as embedding but found {num_embedding_columns} columns marked as embedding'
            )

    def _set_embedding_column(self) -> None:
        for column_name, column_info in self._column_infos.items():
            if column_info.config.get('is_embedding', False):
                self.embedding_column = column_name
                break

    def _configure_client(self) -> None:
        self._client.batch.configure(**self._runtime_config.batch_config)

    def _build_auth_credentials(self):
        dbconfig = self._db_config

        if dbconfig.auth_api_key:
            return weaviate.auth.AuthApiKey(api_key=dbconfig.auth_api_key)
        elif dbconfig.username and dbconfig.password:
            return weaviate.auth.AuthClientPassword(
                dbconfig.username, dbconfig.password, dbconfig.scopes
            )
        else:
            return None

    def configure(self, runtime_config=None, **kwargs) -> None:
        """
        Configure the WeaviateDocumentIndex.
        You can either pass a config object to `runtime_config` or pass individual
        config parameters as keyword arguments.
        If a configuration object is passed, it will replace the current configuration.
        If keyword arguments are passed, they will update the current configuration.

        :param runtime_config: the configuration to apply
        :param kwargs: individual configuration parameters
        """
        super().configure(runtime_config, **kwargs)
        # push the batch settings of the (possibly updated) runtime config to the client
        self._configure_client()

    def _create_schema(self) -> None:
        """Build the weaviate class definition and create it if it is missing.

        While walking the columns this also records which of them are ``blob``
        columns and which are non-embedding ``number[]`` columns. If a class
        with the index name already exists, a warning is logged and nothing is
        created.
        """
        weaviate_properties = []

        for name, info in self._column_infos.items():
            # nested document arrays do not become weaviate properties
            if safe_issubclass(info.docarray_type, AnyDocArray):
                continue
            # in weaviate, we do not create a property for the doc's embeddings
            if name == self.embedding_column:
                continue

            db_type = info.db_type
            if db_type == 'blob':
                self.bytes_columns.append(name)
            if db_type == 'number[]':
                self.nonembedding_array_columns.append(name)

            # in weaviate, id and _id is a reserved keyword
            property_name = DOCUMENTID if name == 'id' else name
            weaviate_properties.append(
                {"name": property_name, "dataType": [db_type]}
            )

        # TODO: What is the best way to specify other config that is part of schema?
        # e.g. invertedIndexConfig, shardingConfig, moduleConfig, vectorIndexConfig
        #       and configure replication
        # we will update base on user feedback
        schema: Dict[str, Any] = {
            "properties": weaviate_properties,
            "class": self.index_name,
        }

        if not self._client.schema.exists(self.index_name):
            self._client.schema.create_class(schema)
            return

        logging.warning(
            f"Found index {self.index_name} with schema {schema}. Will reuse existing schema."
        )

    @dataclass
    class DBConfig(BaseDocIndex.DBConfig):
        """Dataclass that contains all "static" configurations of WeaviateDocumentIndex."""

        # address used to create the client when no embedded options are set
        host: str = 'http://localhost:8080'
        # explicit index name; when unset the schema class name is used instead
        index_name: Optional[str] = None
        # username/password credentials (used when no API key is set)
        username: Optional[str] = None
        password: Optional[str] = None
        scopes: List[str] = field(default_factory=lambda: ["offline_access"])
        # API key credentials; take precedence over username/password
        auth_api_key: Optional[str] = None
        embedded_options: Optional[EmbeddedOptions] = None
        default_column_config: Dict[Any, Dict[str, Any]] = field(
            default_factory=lambda: {
                np.ndarray: {},
                docarray.typing.ID: {},
                'string': {},
                'text': {},
                'int': {},
                'number': {},
                'boolean': {},
                'number[]': {},
                'blob': {},
            }
        )

        def __post_init__(self):
            """Upper-case the first character of the index name, if one is set."""
            # To prevent errors, it is important to capitalize the provided index name
            # when working with Weaviate, as it stores index names in a capitalized format.
            # Can't use .capitalize() because it modifies the whole string (See test).
            name = self.index_name
            if name:
                self.index_name = name[0].upper() + name[1:]
            else:
                # an empty name is normalized to None as well
                self.index_name = None

    @dataclass
    class RuntimeConfig(BaseDocIndex.RuntimeConfig):
        """Dataclass that contains all "dynamic" configurations of WeaviateDocumentIndex."""

        # settings passed as keyword arguments to the client's batch configuration.
        # The factory hands out a copy of the module-level defaults so that
        # mutating one instance's batch_config cannot leak into
        # DEFAULT_BATCH_CONFIG or into other instances.
        batch_config: Dict[str, Any] = field(
            default_factory=lambda: dict(DEFAULT_BATCH_CONFIG)
        )

    def _del_items(self, doc_ids: Sequence[str]):
        has_matches = True

        operands = [
            {"path": [DOCUMENTID], "operator": "Equal", "valueString": doc_id}
            for doc_id in doc_ids
        ]
        where_filter = {
            "operator": "Or",
            "operands": operands,
        }

        # do a loop because there is a limit to how many objects can be deleted at
        # in a single query
        # see: https://weaviate.io/developers/weaviate/api/rest/batch#maximum-number-of-deletes-per-query
        while has_matches:
            results = self._client.batch.delete_objects(
                class_name=self.index_name,
                where=where_filter,
            )

            has_matches = results["results"]["matches"]

    def _filter(self, filter_query: Any, limit: int) -> Union[DocList, List[Dict]]:
        """Return the documents that match a weaviate where-filter.

        :param filter_query: the where-filter; it is not modified
        :param limit: maximum number of documents to return
        :return: the parsed result dictionaries
        """
        # `_overwrite_id` rewrites the filter in place (including nested
        # operands), so work on a deep copy and leave the caller's object alone
        where_filter = copy.deepcopy(filter_query)
        self._overwrite_id(where_filter)

        results = (
            self._client.query.get(self.index_name, self.properties)
            .with_additional("vector")
            .with_where(where_filter)
            .with_limit(limit)
            .do()
        )

        docs = results["data"]["Get"][self.index_name]

        return [self._parse_weaviate_result(doc) for doc in docs]

    def _filter_batched(
        self, filter_queries: Any, limit: int
    ) -> Union[List[DocList], List[List[Dict]]]:
        """Run several where-filters in a single multi-get request.

        :param filter_queries: the where-filters, one per query; they are not
            modified
        :param limit: maximum number of documents to return per filter
        :return: one list of parsed result dictionaries per entry of the response
        """
        # `_overwrite_id` rewrites filters in place (including nested operands),
        # so rewrite deep copies and leave the caller's objects alone
        where_filters = [copy.deepcopy(query) for query in filter_queries]
        for where_filter in where_filters:
            self._overwrite_id(where_filter)

        qs = [
            self._client.query.get(self.index_name, self.properties)
            .with_additional("vector")
            .with_where(where_filter)
            .with_limit(limit)
            .with_alias(f'query_{i}')
            for i, where_filter in enumerate(where_filters)
        ]

        batched_results = self._client.query.multi_get(qs).do()

        return [
            [self._parse_weaviate_result(doc) for doc in batched_result]
            for batched_result in batched_results["data"]["Get"].values()
        ]

    def find(
        self,
        query: Union[AnyTensor, BaseDoc],
        search_field: str = '',
        limit: int = 10,
        **kwargs,
    ):
        """
        Find k-nearest neighbors of the query.

        :param query: query vector for KNN/ANN search, or a document whose
            embedding field is used as the query vector. Has single axis.
        :param search_field: not supported by this index; must be left as an
            empty string
        :param limit: maximum number of documents to return per query
        :param kwargs: additional keyword arguments forwarded to `_find`
        :return: a named tuple containing `documents` and `scores`
        :raises ValueError: if `search_field` is not an empty string
        """
        self._logger.debug('Executing `find`')
        if search_field != '':
            raise ValueError(
                'Argument search_field is not supported for WeaviateDocumentIndex.\nSet search_field to an empty string to proceed.'
            )

        embedding_field = self._get_embedding_field()
        # a document query contributes the value of its embedding field
        query_vec = (
            self._get_values_by_column([query], embedding_field)[0]
            if isinstance(query, BaseDoc)
            else query
        )

        docs, scores = self._find(
            self._to_numpy(query_vec),
            search_field=search_field,
            limit=limit,
            **kwargs,
        )

        # plain lists of dicts still have to be turned into a DocList
        is_plain_list = isinstance(docs, List) and not isinstance(docs, DocList)
        if is_plain_list:
            docs = self._dict_list_to_docarray(docs)

        return FindResult(documents=docs, scores=scores)

    def _overwrite_id(self, where_filter):
        """
        Overwrite the id field in the where filter to DOCUMENTID
        if the "id" field is present in the path
        """
        for key, value in where_filter.items():
            if key == "path" and value == ["id"]:
                where_filter[key] = [DOCUMENTID]
            elif isinstance(value, dict):
                self._overwrite_id(value)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        self._overwrite_id(item)

    def _find(
        self,
        query: np.ndarray,
        limit: int,
        search_field: str = '',
        score_name: Literal["certainty", "distance"] = "certainty",
        score_threshold: Optional[float] = None,
    ) -> _FindResult:
        """Run a near-vector search for a single query vector.

        :param query: the query vector
        :param limit: maximum number of documents to return
        :param search_field: not supported; a warning is logged if it is set
        :param score_name: which score to request, `"certainty"` or `"distance"`
        :param score_threshold: optional threshold passed to weaviate under
            `score_name`; `None` means no threshold
        :return: the parsed documents together with their scores
        """
        index_name = self.index_name
        if search_field:
            logging.warning(
                'The search_field argument is not supported for the WeaviateDocumentIndex and will be ignored.'
            )
        near_vector: Dict[str, Any] = {
            "vector": query,
        }
        # compare against None so that an explicit threshold of 0.0 is kept
        if score_threshold is not None:
            near_vector[score_name] = score_threshold

        results = (
            self._client.query.get(index_name, self.properties)
            .with_near_vector(
                near_vector,
            )
            .with_limit(limit)
            .with_additional([score_name, "vector"])
            .do()
        )

        docs, scores = self._format_response(
            results["data"]["Get"][index_name], score_name
        )
        return _FindResult(docs, parse_obj_as(NdArray, scores))

    def _format_response(
        self, results, score_name
    ) -> Tuple[List[Dict[Any, Any]], List[Any]]:
        """
        Format the response from Weaviate into a Tuple of DocList and scores
        """

        documents = []
        scores = []

        for result in results:
            score = result["_additional"][score_name]
            scores.append(score)

            document = self._parse_weaviate_result(result)
            documents.append(document)

        return documents, scores

    def find_batched(
        self,
        queries: Union[AnyTensor, DocList],
        search_field: str = '',
        limit: int = 10,
        **kwargs: Any,
    ) -> FindResultBatched:
        """Find documents in the index using nearest neighbor search.

        :param queries: query vectors for KNN/ANN search.
            Can be either a tensor-like (np.array, torch.Tensor, etc.)
            or a DocList.
            If a tensor-like is passed, it should have shape (batch_size, vector_dim)
        :param search_field: not supported by this index; must be left as an
            empty string
        :param limit: maximum number of documents to return per query
        :param kwargs: additional keyword arguments forwarded to `_find_batched`
        :return: a named tuple containing `documents` and `scores`
        :raises ValueError: if `search_field` is not an empty string
        """
        self._logger.debug('Executing `find_batched`')
        if search_field != '':
            raise ValueError(
                'Argument search_field is not supported for WeaviateDocumentIndex.\nSet search_field to an empty string to proceed.'
            )
        embedding_field = self._get_embedding_field()

        if not isinstance(queries, Sequence):
            query_vec_np = self._to_numpy(queries)
        else:
            # sequences contribute the values of their embedding column,
            # stacked into one array
            vectors = self._get_values_by_column(queries, embedding_field)
            query_vec_np = np.stack(tuple(self._to_numpy(vec) for vec in vectors))

        da_list, scores = self._find_batched(
            query_vec_np, search_field=search_field, limit=limit, **kwargs
        )

        # lists of plain dicts still have to be converted
        holds_plain_lists = len(da_list) > 0 and isinstance(da_list[0], List)
        if holds_plain_lists:
            da_list = [self._dict_list_to_docarray(docs) for docs in da_list]

        return FindResultBatched(documents=da_list, scores=scores)  # type: ignore

    def _find_batched(
        self,
        queries: np.ndarray,
        limit: int,
        search_field: str = '',
        score_name: Literal["certainty", "distance"] = "certainty",
        score_threshold: Optional[float] = None,
    ) -> _FindResultBatched:
        """Run one near-vector search per query vector in a single request.

        :param queries: the query vectors, one per row
        :param limit: maximum number of documents to return per query
        :param search_field: unused by this method
        :param score_name: which score to request, `"certainty"` or `"distance"`
        :param score_threshold: optional threshold passed to weaviate under
            `score_name`; `None` means no threshold
        :return: per-query lists of parsed documents and of scores; both are
            empty when no queries are given
        """
        # without queries there is nothing to send, and unpacking `zip(*[])`
        # below would fail
        if len(queries) == 0:
            return _FindResultBatched([], [])

        qs = []
        for i, query in enumerate(queries):
            near_vector: Dict[str, Any] = {"vector": query}

            # compare against None so that an explicit threshold of 0.0 is kept
            if score_threshold is not None:
                near_vector[score_name] = score_threshold

            q = (
                self._client.query.get(self.index_name, self.properties)
                .with_near_vector(near_vector)
                .with_limit(limit)
                .with_additional([score_name, "vector"])
                .with_alias(f'query_{i}')
            )

            qs.append(q)

        results = self._client.query.multi_get(qs).do()

        docs_and_scores = [
            self._format_response(result, score_name)
            for result in results["data"]["Get"].values()
        ]

        docs, scores = zip(*docs_and_scores)
        return _FindResultBatched(list(docs), list(scores))

    def _get_items(self, doc_ids: Sequence[str]) -> List[Dict]:
        # TODO: warn when doc_ids > QUERY_MAXIMUM_RESULTS after
        #       https://github.com/weaviate/weaviate/issues/2792
        #       is implemented
        operands = [
            {"path": [DOCUMENTID], "operator": "Equal", "valueString": doc_id}
            for doc_id in doc_ids
        ]
        where_filter = {
            "operator": "Or",
            "operands": operands,
        }

        results = (
            self._client.query.get(self.index_name, self.properties)
            .with_where(where_filter)
            .with_additional("vector")
            .do()
        )

        docs = [
            self._parse_weaviate_result(doc)
            for doc in results["data"]["Get"][self.index_name]
        ]

        return docs

    def _rewrite_documentid(self, document: Dict):
        """Return a shallow copy of the document with ``id`` stored under ``DOCUMENTID``.

        :param document: the document dictionary; must contain an ``id`` key
        :return: the copied dictionary with the key renamed
        """
        renamed = document.copy()
        # move the value over to the DOCUMENTID key
        renamed[DOCUMENTID] = renamed.pop('id')
        return renamed

    def _parse_weaviate_result(self, result: Dict) -> Dict:
        """
        Convert a raw weaviate result entry back into the layout of the schema
        the index was created with. Works on a shallow copy of `result`.

        :param result: a single result entry returned by weaviate
        :return: the converted dictionary
        """
        parsed = result.copy()

        # the id is stored under DOCUMENTID; move it back to `id`
        if DOCUMENTID in parsed:
            parsed['id'] = parsed.pop(DOCUMENTID)

        # the vector is delivered inside `_additional`; it is only unpacked when
        # the index has an embedding column
        if self.embedding_column and '_additional' in parsed:
            extras = parsed.pop('_additional')
            if 'vector' in extras:
                parsed[self.embedding_column] = extras['vector']

        # bytes columns were stored base64 encoded; decode them again
        self._decode_base64_properties_to_bytes(parsed)

        return parsed

    def _index(self, column_to_data: Dict[str, Generator[Any, None, None]]):
        """Add the given column data to the index through the client's batch.

        :param column_to_data: mapping from column name to the column's values
        """
        self._index_subindex(column_to_data)

        rows = self._transpose_col_value_dict(column_to_data)
        class_name = self.index_name

        with self._client.batch as batch:
            for row in rows:
                payload = self._rewrite_documentid(row)
                self._encode_bytes_columns_to_base64(payload)
                self._convert_nonembedding_array_to_list(payload)

                # the embedding is passed separately as the object's vector
                vector = None
                if self.embedding_column:
                    vector = payload.pop(self.embedding_column)

                # the uuid is derived from the payload without the embedding
                object_id = weaviate.util.generate_uuid5(payload, class_name)
                batch.add_data_object(
                    data_object=payload,
                    class_name=class_name,
                    uuid=object_id,
                    vector=vector,
                )

    def _text_search(
        self, query: str, limit: int, search_field: str = ''
    ) -> _FindResult:
        """Run a BM25 text search.

        :param query: the text to search for
        :param limit: maximum number of documents to return
        :param search_field: name of the property to search on; when empty no
            property restriction is sent
        :return: the parsed documents together with their scores
        """
        index_name = self.index_name
        bm25: Dict[str, Any] = {"query": query}
        # an empty search_field must not be sent as the property name `''`
        if search_field:
            bm25["properties"] = [search_field]

        results = (
            self._client.query.get(index_name, self.properties)
            .with_bm25(**bm25)
            .with_limit(limit)
            .with_additional(["score", "vector"])
            .do()
        )

        docs, scores = self._format_response(
            results["data"]["Get"][index_name], "score"
        )

        return _FindResult(documents=docs, scores=parse_obj_as(NdArray, scores))

    def _text_search_batched(
        self, queries: Sequence[str], limit: int, search_field: str = ''
    ) -> _FindResultBatched:
        """Run one BM25 text search per query in a single request.

        :param queries: the texts to search for
        :param limit: maximum number of documents to return per query
        :param search_field: name of the property to search on; when empty no
            property restriction is sent
        :return: per-query lists of parsed documents and of scores; both are
            empty when no queries are given
        """
        # without queries there is nothing to send, and unpacking `zip(*[])`
        # below would fail
        if len(queries) == 0:
            return _FindResultBatched([], [])

        qs = []
        for i, query in enumerate(queries):
            bm25: Dict[str, Any] = {"query": query}
            # an empty search_field must not be sent as the property name `''`
            if search_field:
                bm25["properties"] = [search_field]

            q = (
                self._client.query.get(self.index_name, self.properties)
                .with_bm25(**bm25)
                .with_limit(limit)
                .with_additional(["score", "vector"])
                .with_alias(f'query_{i}')
            )

            qs.append(q)

        results = self._client.query.multi_get(qs).do()

        docs_and_scores = [
            self._format_response(result, "score")
            for result in results["data"]["Get"].values()
        ]

        docs, scores = zip(*docs_and_scores)
        return _FindResultBatched(list(docs), list(scores))

    def execute_query(self, query: Any, *args, **kwargs) -> Any:
        """
        Execute a query on the WeaviateDocumentIndex.

        Can take two kinds of inputs:

        1. A native query of the underlying database. This is meant as a passthrough so that you
        can enjoy any functionality that is not available through the Document index API.
        2. The output of this Document index' `QueryBuilder.build()` method.

        :param query: the query to execute
        :param args: positional arguments to pass to the query
        :param kwargs: keyword arguments to pass to the query
        :return: the result of the query. For a query builder this is a single
            DocList when there is one result set and a list of DocLists
            otherwise; for a string it is the raw client response; any other
            input yields ``None``.
        """
        da_class = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))

        if isinstance(query, self.QueryBuilder):
            response = self._client.query.multi_get(query._queries).do()

            def to_document(hit):
                # TODO: use
                # return self._schema(**self._parse_weaviate_result(hit))
                # when https://github.com/weaviate/weaviate/issues/2858
                # is fixed
                return self._schema.from_view(self._parse_weaviate_result(hit))  # type: ignore

            doc_lists = [
                da_class([to_document(hit) for hit in hits])
                for hits in response["data"]["Get"].values()
            ]
            # a single result set is returned unwrapped
            if len(doc_lists) > 1:
                return doc_lists
            return doc_lists[0]

        # TODO: validate graphql query string before sending it to weaviate
        if isinstance(query, str):
            return self._client.query.raw(query)

        return None

    def num_docs(self) -> int:
        """
        Get the number of documents.

        :return: the meta count reported by an aggregate query on the index
        """
        class_name = self.index_name
        response = self._client.query.aggregate(class_name).with_meta_count().do()
        # TODO: decorator to check for errors
        aggregate = response["data"]["Aggregate"][class_name]
        return aggregate[0]["meta"]["count"]

    def python_type_to_db_type(self, python_type: Type) -> Any:
        """Map python type to database type.
        Takes any python type and returns the corresponding database column type.

        :param python_type: a python type.
        :return: the corresponding database column type
        :raises ValueError: if ``python_type`` is not supported
        """
        for allowed_type in WEAVIATE_PY_VEC_TYPES:
            if safe_issubclass(python_type, allowed_type):
                return 'number[]'

        # The first matching entry wins, so more specific types must come
        # first. In particular `bool` has to precede `int`: bool is a subclass
        # of int, and checking int first would map boolean columns to 'int'.
        py_weaviate_type_map = {
            docarray.typing.ID: 'string',
            str: 'text',
            bool: 'boolean',
            int: 'int',
            float: 'number',
            np.ndarray: 'number[]',
            bytes: 'blob',
        }

        for py_type, weaviate_type in py_weaviate_type_map.items():
            if safe_issubclass(python_type, py_type):
                return weaviate_type

        raise ValueError(f'Unsupported column type for {type(self)}: {python_type}')

    def build_query(self) -> BaseDocIndex.QueryBuilder:
        """
        Create a query builder bound to this WeaviateDocumentIndex.

        :return: QueryBuilder object
        """
        builder = self.QueryBuilder(self)
        return builder

    def _get_embedding_field(self):
        for colname, colinfo in self._column_infos.items():
            # no need to check for missing is_embedding attribute because this check
            # is done when the index is created
            if colinfo.config.get('is_embedding', None):
                return colname

        # just to pass mypy
        return ""

    def _encode_bytes_columns_to_base64(self, doc):
        for column in self.bytes_columns:
            if doc[column] is not None:
                doc[column] = base64.b64encode(doc[column]).decode("utf-8")

    def _decode_base64_properties_to_bytes(self, doc):
        for column in self.bytes_columns:
            if doc[column] is not None:
                doc[column] = base64.b64decode(doc[column])

    def _convert_nonembedding_array_to_list(self, doc):
        for column in self.nonembedding_array_columns:
            if doc[column] is not None:
                doc[column] = doc[column].tolist()

    def _filter_by_parent_id(self, id: str) -> Optional[List[str]]:
        results = (
            self._client.query.get(self._db_config.index_name, ['docarrayid'])
            .with_where(
                {'path': ['parent_id'], 'operator': 'Equal', 'valueString': f'{id}'}
            )
            .do()
        )

        ids = [
            res['docarrayid']
            for res in results['data']['Get'][self._db_config.index_name]
        ]
        return ids

    def _doc_exists(self, doc_id: str) -> bool:
        result = (
            self._client.query.get(self.index_name, ['docarrayid'])
            .with_where(
                {
                    "path": ['docarrayid'],
                    "operator": "Equal",
                    "valueString": f'{doc_id}',
                }
            )
            .do()
        )
        docs = result["data"]["Get"][self.index_name]
        return docs is not None and len(docs) > 0

    class QueryBuilder(BaseDocIndex.QueryBuilder):
        def __init__(self, document_index):
            """Start with a single get-query on the index's class and properties.

            :param document_index: the index whose client, index name and
                properties the query is built from
            """
            base_query = document_index._client.query.get(
                document_index.index_name, document_index.properties
            )
            self._queries = [base_query]

        def build(self, *args, **kwargs) -> Any:
            """Build the query object.

            Every collected query is converted to a hybrid query if it
            qualifies, asked to return the vector and given the alias
            ``query_<position>``.

            :return: self
            """
            for position, get_query in enumerate(self._queries):
                if self._is_hybrid_query(get_query):
                    self._make_proper_hybrid_query(get_query)
                get_query.with_additional(["vector"]).with_alias(f'query_{position}')

            return self

        def _is_hybrid_query(self, query: weaviate.gql.get.GetBuilder) -> bool:
            """
            Checks if a query has been composed with both a with_bm25 and a with_near_vector verb

            :param query: the query to inspect
            :return: ``True`` if both a bm25 clause and a near-vector clause are set
            """
            if not query._near_ask:
                return False

            # wrap in bool() so the declared return type holds; the bare `and`
            # expression would hand back None or one of the operands
            return bool(query._bm25 and query._near_ask._content.get("vector", None))

        def _make_proper_hybrid_query(
            self, query: weaviate.gql.get.GetBuilder
        ) -> weaviate.gql.get.GetBuilder:
            """
            Modifies a query to be a proper hybrid query.

            In weaviate, a query with with_bm25 and with_near_vector verb is not a hybrid query.
            We need to use the with_hybrid verb to make it a hybrid query.

            :param query: a query that has both a bm25 and a near-vector clause;
                it is modified in place
            :return: the same query object, now carrying a hybrid clause instead
                of the bm25 and near-vector clauses
            """

            text_query = query._bm25.query
            vector_query = query._near_ask._content["vector"]
            hybrid_query = weaviate.gql.get.Hybrid(
                query=text_query, vector=vector_query, alpha=0.5
            )

            # swap the two separate clauses for the combined hybrid clause
            query._bm25 = None
            query._near_ask = None
            query._hybrid = hybrid_query

            # honor the declared return type (the original fell off the end
            # and returned None)
            return query

        def _overwrite_id(self, where_filter):
            """
            Overwrite the id field in the where filter to DOCUMENTID
            if the "id" field is present in the path
            """
            for key, value in where_filter.items():
                if key == "path" and value == ["id"]:
                    where_filter[key] = [DOCUMENTID]
                elif isinstance(value, dict):
                    self._overwrite_id(value)
                elif isinstance(value, list):
                    for item in value:
                        if isinstance(item, dict):
                            self._overwrite_id(item)

        def find(
            self,
            query,
            score_name: Literal["certainty", "distance"] = "certainty",
            score_threshold: Optional[float] = None,
            **kwargs,
        ) -> Any:
            """
            Find k-nearest neighbors of the query.

            :param query: query vector for search. Has single axis.
            :param score_name: either `"certainty"` (default) or `"distance"`
            :param score_threshold: the threshold of the score; `None` means
                no threshold
            :param kwargs: only `search_field` is looked at; it is not supported
                and triggers a warning when set
            :return: self
            """
            if kwargs.get('search_field'):
                logging.warning(
                    'The search_field argument is not supported for the WeaviateDocumentIndex and will be ignored.'
                )

            near_vector = {
                "vector": query,
            }
            # compare against None so that an explicit threshold of 0.0 is kept
            if score_threshold is not None:
                near_vector[score_name] = score_threshold

            self._queries[0] = self._queries[0].with_near_vector(near_vector)
            return self

        def find_batched(
            self,
            queries,
            score_name: Literal["certainty", "distance"] = "certainty",
            score_threshold: Optional[float] = None,
        ) -> Any:
            """Find k-nearest neighbors of the query vectors.

            :param queries: query vectors for KNN/ANN search.
                Can be either a tensor-like (np.array, torch.Tensor, etc.)
                or a DocList.
                If a tensor-like is passed, it should have shape `(batch_size, vector_dim)`
            :param score_name: either `"certainty"` (default) or `"distance"`
            :param score_threshold: the threshold of the score; `None` means
                no threshold
            :return: self
            """
            adj_queries, adj_clauses = self._resize_queries_and_clauses(
                self._queries, queries
            )
            new_queries = []

            for query, clause in zip(adj_queries, adj_clauses):
                near_vector = {
                    "vector": clause,
                }
                # compare against None so that an explicit threshold of 0.0 is kept
                if score_threshold is not None:
                    near_vector[score_name] = score_threshold

                new_queries.append(query.with_near_vector(near_vector))

            self._queries = new_queries

            return self

        def filter(self, where_filter: Any) -> Any:
            """Find documents in the index based on a filter query

            :param where_filter: a filter; it is not modified
            :return: self
            """
            # `_overwrite_id` also rewrites nested operands in place, so a
            # shallow copy is not enough to protect the caller's filter
            where_filter = copy.deepcopy(where_filter)
            self._overwrite_id(where_filter)
            self._queries[0] = self._queries[0].with_where(where_filter)
            return self

        def filter_batched(self, filters) -> Any:
            """Find documents in the index based on a filter query

            :param filters: filters; they are not modified
            :return: self
            """
            adj_queries, adj_clauses = self._resize_queries_and_clauses(
                self._queries, filters
            )
            new_queries = []

            for query, clause in zip(adj_queries, adj_clauses):
                # `_overwrite_id` also rewrites nested operands in place, so a
                # shallow copy is not enough to protect the caller's filters
                clause = copy.deepcopy(clause)
                self._overwrite_id(clause)
                new_queries.append(query.with_where(clause))

            self._queries = new_queries

            return self

        def text_search(self, query: str, search_field: Optional[str] = None) -> Any:
            """Add a BM25 text search clause to the first query

            :param query: The text to search for
            :param search_field: name of the field to search on; when it is
                not given, no property restriction is passed
            :return: self
            """
            properties = {"properties": [search_field]} if search_field else {}
            first_query = self._queries[0]
            self._queries[0] = first_query.with_bm25(query=query, **properties)
            return self

        def text_search_batched(
            self, queries: Sequence[str], search_field: Optional[str] = None
        ) -> Any:
            """Add one BM25 text search clause per query text

            :param queries: The texts to search for
            :param search_field: name of the field to search on; when it is
                not given, no property restriction is passed
            :return: self
            """
            adj_queries, adj_clauses = self._resize_queries_and_clauses(
                self._queries, queries
            )

            def with_text(get_query, text):
                # assemble the bm25 arguments for one query/text pair
                bm25 = {"query": text}
                if search_field:
                    bm25["properties"] = [search_field]
                return get_query.with_bm25(**bm25)

            self._queries = [
                with_text(get_query, text)
                for get_query, text in zip(adj_queries, adj_clauses)
            ]

            return self

        def limit(self, limit: int) -> Any:
            """Apply a result limit to every collected query.

            :param limit: maximum number of documents to return per query
            :return: self
            """
            limited = []
            for query in self._queries:
                limited.append(query.with_limit(limit))
            self._queries = limited
            return self

        def _resize_queries_and_clauses(self, queries, clauses):
            """
            Adjust the length and content of queries and clauses so that we can compose
            them element-wise
            """
            num_clauses = len(clauses)
            num_queries = len(queries)

            # if there's only one clause, then we assume that it should be applied
            # to every query
            if num_clauses == 1:
                return queries, clauses * num_queries
            # if there's only one query, then we can lengthen it to match the number
            # of clauses
            elif num_queries == 1:
                return [copy.deepcopy(queries[0]) for _ in range(num_clauses)], clauses
            # if the number of queries and clauses is the same, then we can just
            # return them as-is
            elif num_clauses == num_queries:
                return queries, clauses
            else:
                raise ValueError(
                    f"Can't compose {num_clauses} clauses with {num_queries} queries"
                )

`DBConfig` `dataclass`

Bases: DBConfig

Dataclass that contains all "static" configurations of WeaviateDocumentIndex.

Source code in docarray/index/backends/weaviate.py

@dataclass
class DBConfig(BaseDocIndex.DBConfig):
    """Static configuration of a WeaviateDocumentIndex.

    Holds the connection target, the optional credentials, the optional embedded
    options and the per-column-type default configuration.
    """

    host: str = 'http://localhost:8080'
    index_name: Optional[str] = None
    username: Optional[str] = None
    password: Optional[str] = None
    scopes: List[str] = field(default_factory=lambda: ["offline_access"])
    auth_api_key: Optional[str] = None
    embedded_options: Optional[EmbeddedOptions] = None
    # every known column type starts out with its own empty configuration dict
    default_column_config: Dict[Any, Dict[str, Any]] = field(
        default_factory=lambda: {
            column_type: {}
            for column_type in (
                np.ndarray,
                docarray.typing.ID,
                'string',
                'text',
                'int',
                'number',
                'boolean',
                'number[]',
                'blob',
            )
        }
    )

    def __post_init__(self):
        """Normalize `index_name` so that its first character is upper case.

        Weaviate stores index names in a capitalized format, so the provided name
        is adjusted up front to prevent errors. Only the first character is
        touched: `str.capitalize()` is not usable here because it modifies the
        whole string. A missing or empty name becomes `None`.
        """
        name = self.index_name
        if not name:
            self.index_name = None
            return
        self.index_name = name[0].upper() + name[1:]

`QueryBuilder`

Bases: QueryBuilder

Source code in docarray/index/backends/weaviate.py

class QueryBuilder(BaseDocIndex.QueryBuilder):
    """Fluent builder that composes one or more Weaviate `Get` queries.

    The builder starts with a single query against the index. The `*_batched`
    verbs may fan that query out into several queries (one per clause); the
    non-batched verbs only touch the first query. Every verb returns the builder
    itself so that calls can be chained.
    """

    def __init__(self, document_index):
        """Start with a single `Get` query over the index' properties.

        :param document_index: the index whose client, index name and properties
            are used to create the initial query
        """
        self._queries = [
            document_index._client.query.get(
                document_index.index_name, document_index.properties
            )
        ]

    def build(self, *args, **kwargs) -> Any:
        """Build the query object.

        Rewrites bm25 + near-vector combinations into hybrid queries, asks for the
        stored vector and gives every query the alias `query_<position>`.

        :return: self
        """
        for i, q in enumerate(self._queries):
            if self._is_hybrid_query(q):
                self._make_proper_hybrid_query(q)
            # NOTE(review): the return value is discarded, so this relies on the
            # builder verbs modifying the query object in place.
            q.with_additional(["vector"]).with_alias(f'query_{i}')

        return self

    def _is_hybrid_query(self, query: weaviate.gql.get.GetBuilder) -> bool:
        """
        Checks if a query has been composed with both a with_bm25 and a with_near_vector verb

        :param query: the query to inspect
        :return: True if both a bm25 clause and a near-vector clause are present
        """
        near = query._near_ask
        if not near or not query._bm25:
            return False
        # coerce to a real bool so that the declared return type holds
        return bool(near._content.get("vector", None))

    def _make_proper_hybrid_query(
        self, query: weaviate.gql.get.GetBuilder
    ) -> weaviate.gql.get.GetBuilder:
        """
        Modifies a query to be a proper hybrid query.

        In weaviate, a query with with_bm25 and with_near_vector verb is not a hybrid query.
        We need to use the with_hybrid verb to make it a hybrid query.

        :param query: a query holding both a bm25 and a near-vector clause;
            it is modified in place
        :return: the same (now hybrid) query object
        """

        text_query = query._bm25.query
        vector_query = query._near_ask._content["vector"]
        hybrid_query = weaviate.gql.get.Hybrid(
            query=text_query, vector=vector_query, alpha=0.5
        )

        # replace the two separate clauses with the single hybrid clause
        query._bm25 = None
        query._near_ask = None
        query._hybrid = hybrid_query

        return query

    def _overwrite_id(self, where_filter):
        """
        Overwrite the id field in the where filter to DOCUMENTID
        if the "id" field is present in the path

        The filter is modified in place, including dicts nested directly or
        inside lists.

        :param where_filter: the filter dict to rewrite
        """
        for key, value in where_filter.items():
            if key == "path" and value == ["id"]:
                where_filter[key] = [DOCUMENTID]
            elif isinstance(value, dict):
                self._overwrite_id(value)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        self._overwrite_id(item)

    def find(
        self,
        query,
        score_name: Literal["certainty", "distance"] = "certainty",
        score_threshold: Optional[float] = None,
        **kwargs,
    ) -> Any:
        """
        Find k-nearest neighbors of the query.

        Only the first pending query is modified.

        :param query: query vector for search. Has single axis.
        :param score_name: either `"certainty"` (default) or `"distance"`
        :param score_threshold: the threshold of the score; `None` means that no
            threshold is applied
        :return: self
        """
        if kwargs.get('search_field'):
            logging.warning(
                'The search_field argument is not supported for the WeaviateDocumentIndex and will be ignored.'
            )

        near_vector = {
            "vector": query,
        }
        # compare against None so that an explicit threshold of 0 is not dropped
        if score_threshold is not None:
            near_vector[score_name] = score_threshold

        self._queries[0] = self._queries[0].with_near_vector(near_vector)
        return self

    def find_batched(
        self,
        queries,
        score_name: Literal["certainty", "distance"] = "certainty",
        score_threshold: Optional[float] = None,
    ) -> Any:
        """Find k-nearest neighbors of the query vectors.

        :param queries: query vectors for KNN/ANN search.
            Can be either a tensor-like (np.array, torch.Tensor, etc.)
            or a DocList.
            If a tensor-like is passed, it should have shape `(batch_size, vector_dim)`
        :param score_name: either `"certainty"` (default) or `"distance"`
        :param score_threshold: the threshold of the score; `None` means that no
            threshold is applied
        :return: self
        """
        adj_queries, adj_clauses = self._resize_queries_and_clauses(
            self._queries, queries
        )
        new_queries = []

        for query, clause in zip(adj_queries, adj_clauses):
            near_vector = {
                "vector": clause,
            }
            # compare against None so that an explicit threshold of 0 is not dropped
            if score_threshold is not None:
                near_vector[score_name] = score_threshold

            new_queries.append(query.with_near_vector(near_vector))

        self._queries = new_queries

        return self

    def filter(self, where_filter: Any) -> Any:
        """Find documents in the index based on a filter query

        Only the first pending query is modified.

        :param where_filter: a filter; it is not modified
        :return: self
        """
        # Deep copy: `_overwrite_id` rewrites nested dicts in place, so a shallow
        # copy would leak the rewrite into the caller's filter.
        where_filter = copy.deepcopy(where_filter)
        self._overwrite_id(where_filter)
        self._queries[0] = self._queries[0].with_where(where_filter)
        return self

    def filter_batched(self, filters) -> Any:
        """Find documents in the index based on a filter query

        :param filters: filters; they are not modified
        :return: self
        """
        adj_queries, adj_clauses = self._resize_queries_and_clauses(
            self._queries, filters
        )
        new_queries = []

        for query, clause in zip(adj_queries, adj_clauses):
            # Deep copy: `_overwrite_id` rewrites nested dicts in place, so a
            # shallow copy would leak the rewrite into the caller's filter.
            clause = copy.deepcopy(clause)
            self._overwrite_id(clause)
            new_queries.append(query.with_where(clause))

        self._queries = new_queries

        return self

    def text_search(self, query: str, search_field: Optional[str] = None) -> Any:
        """Find documents in the index based on a text search query

        Only the first pending query is modified.

        :param query: The text to search for
        :param search_field: name of the field to search on; when empty or None
            no `properties` restriction is passed to bm25
        :return: self
        """
        bm25: Dict[str, Any] = {"query": query}
        if search_field:
            bm25["properties"] = [search_field]
        self._queries[0] = self._queries[0].with_bm25(**bm25)
        return self

    def text_search_batched(
        self, queries: Sequence[str], search_field: Optional[str] = None
    ) -> Any:
        """Find documents in the index based on a text search query

        :param queries: The texts to search for
        :param search_field: name of the field to search on; when empty or None
            no `properties` restriction is passed to bm25
        :return: self
        """
        adj_queries, adj_clauses = self._resize_queries_and_clauses(
            self._queries, queries
        )
        new_queries = []

        for query, clause in zip(adj_queries, adj_clauses):
            bm25 = {"query": clause}
            if search_field:
                bm25["properties"] = [search_field]
            new_queries.append(query.with_bm25(**bm25))

        self._queries = new_queries

        return self

    def limit(self, limit: int) -> Any:
        """Apply a result limit to every pending query.

        :param limit: value passed to each query's `with_limit`
        :return: self
        """
        self._queries = [query.with_limit(limit) for query in self._queries]
        return self

    def _resize_queries_and_clauses(self, queries, clauses):
        """
        Adjust the length and content of queries and clauses so that we can compose
        them element-wise

        :param queries: the pending query objects
        :param clauses: the clauses (vectors, filters, texts, ...) to attach;
            any sized, indexable container
        :return: a `(queries, clauses)` pair of equal length
        :raises ValueError: if the two lengths differ and neither of them is 1
        """
        num_clauses = len(clauses)
        num_queries = len(queries)

        # if there's only one clause, then we assume that it should be applied
        # to every query
        if num_clauses == 1:
            # Repeat the single element explicitly. `clauses * num_queries` only
            # repeats for lists/tuples; for an array-like of shape (1, dim) it
            # would multiply the values element-wise instead.
            return queries, [clauses[0]] * num_queries
        # if there's only one query, then we can lengthen it to match the number
        # of clauses; every clause gets its own independent copy of the query
        elif num_queries == 1:
            return [copy.deepcopy(queries[0]) for _ in range(num_clauses)], clauses
        # if the number of queries and clauses is the same, then we can just
        # return them as-is
        elif num_clauses == num_queries:
            return queries, clauses
        else:
            raise ValueError(
                f"Can't compose {num_clauses} clauses with {num_queries} queries"
            )

`build(*args, **kwargs)`

Build the query object.

Source code in docarray/index/backends/weaviate.py

def build(self, *args, **kwargs) -> Any:
    """Finalize every pending query and return the builder.

    For each query, in order: a bm25 + near-vector combination is rewritten via
    `_make_proper_hybrid_query`, the `vector` additional property is requested and
    the alias `query_<position>` is assigned.

    :return: self
    """
    for position, pending in enumerate(self._queries):
        if self._is_hybrid_query(pending):
            self._make_proper_hybrid_query(pending)
        # NOTE(review): results of these calls are discarded, so this relies on
        # the builder verbs modifying the query object in place.
        with_vector = pending.with_additional(["vector"])
        with_vector.with_alias(f'query_{position}')

    return self

`filter(where_filter)`

Find documents in the index based on a filter query

Parameters:

Name	Type	Description	Default
`where_filter`	`Any`	a filter	required

Returns:

Type	Description
`Any`	self

Source code in docarray/index/backends/weaviate.py

def filter(self, where_filter: Any) -> Any:
    """Find documents in the index based on a filter query

    Only the first pending query is modified.

    :param where_filter: a filter; it is not modified
    :return: self
    """
    # Deep copy: `_overwrite_id` rewrites nested dicts in place, so a shallow
    # copy would leak the rewrite into the caller's filter.
    where_filter = copy.deepcopy(where_filter)
    self._overwrite_id(where_filter)
    self._queries[0] = self._queries[0].with_where(where_filter)
    return self

`filter_batched(filters)`

Find documents in the index based on a filter query

Parameters:

Name	Type	Description	Default
`filters`		filters	required

Returns:

Type	Description
`Any`	self

Source code in docarray/index/backends/weaviate.py

def filter_batched(self, filters) -> Any:
    """Find documents in the index based on a filter query

    :param filters: filters; they are not modified
    :return: self
    """
    adj_queries, adj_clauses = self._resize_queries_and_clauses(
        self._queries, filters
    )
    new_queries = []

    for query, clause in zip(adj_queries, adj_clauses):
        # Deep copy: `_overwrite_id` rewrites nested dicts in place, so a shallow
        # copy would leak the rewrite into the caller's filter.
        clause = copy.deepcopy(clause)
        self._overwrite_id(clause)
        new_queries.append(query.with_where(clause))

    self._queries = new_queries

    return self

`find(query, score_name='certainty', score_threshold=None, **kwargs)`

Find k-nearest neighbors of the query.

Parameters:

Name	Type	Description	Default
`query`		query vector for search. Has single axis.	required
`score_name`	`Literal['certainty', 'distance']`	either `"certainty"` (default) or `"distance"`	`'certainty'`
`score_threshold`	`Optional[float]`	the threshold of the score	`None`

Returns:

Type	Description
`Any`	self

Source code in docarray/index/backends/weaviate.py

def find(
    self,
    query,
    score_name: Literal["certainty", "distance"] = "certainty",
    score_threshold: Optional[float] = None,
    **kwargs,
) -> Any:
    """
    Find k-nearest neighbors of the query.

    Only the first pending query is modified.

    :param query: query vector for search. Has single axis.
    :param score_name: either `"certainty"` (default) or `"distance"`
    :param score_threshold: the threshold of the score; `None` means that no
        threshold is applied
    :return: self
    """
    if kwargs.get('search_field'):
        logging.warning(
            'The search_field argument is not supported for the WeaviateDocumentIndex and will be ignored.'
        )

    near_vector = {
        "vector": query,
    }
    # compare against None so that an explicit threshold of 0 is not dropped
    if score_threshold is not None:
        near_vector[score_name] = score_threshold

    self._queries[0] = self._queries[0].with_near_vector(near_vector)
    return self

`find_batched(queries, score_name='certainty', score_threshold=None)`

Find k-nearest neighbors of the query vectors.

Parameters:

Name	Type	Description	Default
`queries`		query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, or a DocList. If a tensor-like is passed, it should have shape `(batch_size, vector_dim)`	required
`score_name`	`Literal['certainty', 'distance']`	either `"certainty"` (default) or `"distance"`	`'certainty'`
`score_threshold`	`Optional[float]`	the threshold of the score	`None`

Returns:

Type	Description
`Any`	self

Source code in docarray/index/backends/weaviate.py

def find_batched(
    self,
    queries,
    score_name: Literal["certainty", "distance"] = "certainty",
    score_threshold: Optional[float] = None,
) -> Any:
    """Find k-nearest neighbors of the query vectors.

    :param queries: query vectors for KNN/ANN search.
        Can be either a tensor-like (np.array, torch.Tensor, etc.)
        or a DocList.
        If a tensor-like is passed, it should have shape `(batch_size, vector_dim)`
    :param score_name: either `"certainty"` (default) or `"distance"`
    :param score_threshold: the threshold of the score; `None` means that no
        threshold is applied
    :return: self
    """
    adj_queries, adj_clauses = self._resize_queries_and_clauses(
        self._queries, queries
    )
    new_queries = []

    for query, clause in zip(adj_queries, adj_clauses):
        near_vector = {
            "vector": clause,
        }
        # compare against None so that an explicit threshold of 0 is not dropped
        if score_threshold is not None:
            near_vector[score_name] = score_threshold

        new_queries.append(query.with_near_vector(near_vector))

    self._queries = new_queries

    return self

`text_search(query, search_field=None)`

Find documents in the index based on a text search query

Parameters:

Name	Type	Description	Default
`query`	`str`	The text to search for	required
`search_field`	`Optional[str]`	name of the field to search on	`None`

Returns:

Type	Description
`Any`	self

Source code in docarray/index/backends/weaviate.py

def text_search(self, query: str, search_field: Optional[str] = None) -> Any:
    """Attach a bm25 text search to the first pending query.

    :param query: The text to search for
    :param search_field: name of the field to search on; when empty or None
        no `properties` restriction is passed to bm25
    :return: self
    """
    bm25_kwargs: Dict[str, Any] = dict(query=query)
    if search_field:
        bm25_kwargs.update(properties=[search_field])
    first = self._queries[0]
    self._queries[0] = first.with_bm25(**bm25_kwargs)
    return self

`text_search_batched(queries, search_field=None)`

Find documents in the index based on a text search query

Parameters:

Name	Type	Description	Default
`queries`	`Sequence[str]`	The texts to search for	required
`search_field`	`Optional[str]`	name of the field to search on	`None`

Returns:

Type	Description
`Any`	self

Source code in docarray/index/backends/weaviate.py

def text_search_batched(
    self, queries: Sequence[str], search_field: Optional[str] = None
) -> Any:
    """Attach one bm25 text search per text to the pending queries.

    Queries and texts are first paired up by `_resize_queries_and_clauses`.

    :param queries: The texts to search for
    :param search_field: name of the field to search on; when empty or None
        no `properties` restriction is passed to bm25
    :return: self
    """
    paired_queries, texts = self._resize_queries_and_clauses(self._queries, queries)

    updated = []
    for builder, text in zip(paired_queries, texts):
        if search_field:
            bm25_kwargs = {"query": text, "properties": [search_field]}
        else:
            bm25_kwargs = {"query": text}
        updated.append(builder.with_bm25(**bm25_kwargs))

    self._queries = updated
    return self

`RuntimeConfig` `dataclass`

Bases: RuntimeConfig

Dataclass that contains all "dynamic" configurations of WeaviateDocumentIndex.

Source code in docarray/index/backends/weaviate.py

@dataclass
class RuntimeConfig(BaseDocIndex.RuntimeConfig):
    """Dataclass that contains all "dynamic" configurations of WeaviateDocumentIndex."""

    # Each instance gets its own copy of the defaults; handing out
    # DEFAULT_BATCH_CONFIG itself would let an in-place change on one instance's
    # batch_config alter the module-level defaults for every later instance.
    batch_config: Dict[str, Any] = field(
        default_factory=lambda: dict(DEFAULT_BATCH_CONFIG)
    )

`contains(item)`

Checks if a given document exists in the index.

Parameters:

Name	Type	Description	Default
`item`	`BaseDoc`	The document to check. It must be an instance of BaseDoc or its subclass.	required

Returns:

Type	Description
`bool`	True if the document exists in the index, False otherwise.

Source code in docarray/index/abstract.py

def __contains__(self, item: BaseDoc) -> bool:
    """
    Tell whether a document is present in the index.

    The lookup is delegated to `_doc_exists` with the document's id as a string.

    :param item: The document to check.
        It must be an instance of BaseDoc or its subclass.
    :return: True if the document exists in the index, False otherwise.
    :raises TypeError: if `item` is not a BaseDoc (or subclass) instance
    """
    # reject anything that is not a document before touching the backend
    if not safe_issubclass(type(item), BaseDoc):
        raise TypeError(
            f"item must be an instance of BaseDoc or its subclass, not '{type(item).__name__}'"
        )
    return self._doc_exists(str(item.id))

`delitem(key)`

Delete one or multiple Documents from the index, by id. If no document is found, a KeyError is raised.

Parameters:

Name	Type	Description	Default
`key`	`Union[str, Sequence[str]]`	id or ids to delete from the Document index	required

Source code in docarray/index/abstract.py

def __delitem__(self, key: Union[str, Sequence[str]]):
    """Remove one or several Documents from the index, addressed by `id`.

    Documents stored in subindices (fields holding a DocList) that belong to the
    given ids are deleted first, then the documents themselves.

    :param key: id or ids to delete from the Document index
    """
    self._logger.info(f'Deleting documents with id(s) {key} from the index')
    doc_ids = [key] if isinstance(key, str) else key

    # delete nested data
    schema = cast(Type[BaseDoc], self._schema)
    for field_name, type_, _ in self._flatten_schema(schema):
        if not safe_issubclass(type_, AnyDocArray):
            continue
        for doc_id in doc_ids:
            subindex = self._subindices[field_name]
            nested_ids = subindex._filter_by_parent_id(doc_id)
            if nested_ids:
                del subindex[nested_ids]

    # delete data
    self._del_items(doc_ids)

`getitem(key)`

Get one or multiple Documents from the index, by id. If no document is found, a KeyError is raised.

Parameters:

Name	Type	Description	Default
`key`	`Union[str, Sequence[str]]`	id or ids to get from the Document index	required

Source code in docarray/index/abstract.py

def __getitem__(
    self, key: Union[str, Sequence[str]]
) -> Union[TSchema, DocList[TSchema]]:
    """Get one or multiple Documents from the index, by `id`.

    A single id yields a single Document; a sequence of ids yields a DocList.

    :param key: id or ids to get from the Document index
    :return: the Document for a single id, otherwise a DocList of Documents
    :raises KeyError: if no document is found
    """
    # normalize input: remember whether the caller asked for exactly one document
    return_singleton = isinstance(key, str)
    if return_singleton:
        key = [key]

    # retrieve data
    doc_sequence = self._get_items(key)

    # check data
    if len(doc_sequence) == 0:
        raise KeyError(f'No document with id {key} found')

    # retrieve nested data (only applies when the backend handed back dicts)
    schema = cast(Type[BaseDoc], self._schema)
    for field_name, type_, _ in self._flatten_schema(schema):
        if not safe_issubclass(type_, AnyDocArray):
            continue
        if not isinstance(doc_sequence[0], Dict):
            continue
        for doc in doc_sequence:
            self._get_subindex_doclist(doc, field_name)  # type: ignore

    # cast output
    out_docs: DocList[TSchema]
    if isinstance(doc_sequence, DocList):
        out_docs = doc_sequence
    elif isinstance(doc_sequence[0], Dict):
        out_docs = self._dict_list_to_docarray(doc_sequence)  # type: ignore
    else:
        docs_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))
        out_docs = docs_cls(doc_sequence)

    if return_singleton:
        return out_docs[0]
    return out_docs

`init(db_config=None, **kwargs)`

Initialize WeaviateDocumentIndex

Source code in docarray/index/backends/weaviate.py

def __init__(self, db_config=None, **kwargs) -> None:
    """Initialize WeaviateDocumentIndex

    Sets up the column bookkeeping attributes, runs the base class
    initialisation, creates the Weaviate client and then configures the client,
    validates the columns, determines the embedding column and the properties,
    and creates the schema.

    :param db_config: the database configuration, forwarded to the base class
    :param kwargs: further keyword arguments, forwarded to the base class
    """

    self.embedding_column: Optional[str] = None
    self.properties: Optional[List[str]] = None
    # keep track of the column name that contains the bytes
    # type because we will store them as a base64 encoded string
    # in weaviate
    self.bytes_columns: List[str] = []
    # keep track of the array columns that are not embeddings because we will
    # convert them to python lists before uploading to weaviate
    self.nonembedding_array_columns: List[str] = []
    # NOTE(review): the attributes above are set before the base initialisation;
    # presumably it already relies on them -- confirm before reordering.
    super().__init__(db_config=db_config, **kwargs)
    # the casts only narrow the static types of the two config objects
    self._db_config: WeaviateDocumentIndex.DBConfig = cast(
        WeaviateDocumentIndex.DBConfig, self._db_config
    )
    self._runtime_config: WeaviateDocumentIndex.RuntimeConfig = cast(
        WeaviateDocumentIndex.RuntimeConfig, self._runtime_config
    )

    # embedded options take precedence; otherwise connect to the configured host
    # with the credentials produced by `_build_auth_credentials`
    if self._db_config.embedded_options:
        self._client = weaviate.Client(
            embedded_options=self._db_config.embedded_options
        )
    else:
        self._client = weaviate.Client(
            self._db_config.host, auth_client_secret=self._build_auth_credentials()
        )

    self._configure_client()
    self._validate_columns()
    self._set_embedding_column()
    self._set_properties()
    self._create_schema()

`build_query()`

Build a query for WeaviateDocumentIndex.

Returns:

Type	Description
`QueryBuilder`	QueryBuilder object

Source code in docarray/index/backends/weaviate.py

def build_query(self) -> BaseDocIndex.QueryBuilder:
    """
    Create a fresh query builder bound to this WeaviateDocumentIndex.

    :return: QueryBuilder object
    """
    builder = self.QueryBuilder(self)
    return builder

`configure(runtime_config=None, **kwargs)`

Configure the WeaviateDocumentIndex. You can either pass a config object as `runtime_config` or pass individual config parameters as keyword arguments. If a configuration object is passed, it will replace the current configuration. If keyword arguments are passed, they will update the current configuration.

Parameters:

Name	Type	Description	Default
`runtime_config`		the configuration to apply	`None`
`kwargs`		individual configuration parameters	`{}`

Source code in docarray/index/backends/weaviate.py

def configure(self, runtime_config=None, **kwargs) -> None:
    """
    Configure the WeaviateDocumentIndex.
    You can either pass a config object as `runtime_config` or pass individual
    config parameters as keyword arguments.
    If a configuration object is passed, it will replace the current configuration.
    If keyword arguments are passed, they will update the current configuration.

    :param runtime_config: the configuration to apply
    :param kwargs: individual configuration parameters
    """
    super().configure(runtime_config, **kwargs)
    # presumably pushes the updated runtime configuration to the Weaviate
    # client -- confirm against `_configure_client`
    self._configure_client()

`execute_query(query, *args, **kwargs)`

Execute a query on the WeaviateDocumentIndex.

Can take two kinds of inputs:

A native query of the underlying database. This is meant as a passthrough so that you can enjoy any functionality that is not available through the Document index API.
The output of this Document index' QueryBuilder.build() method.

Parameters:

Name	Type	Description	Default
`query`	`Any`	the query to execute	required
`args`		positional arguments to pass to the query	`()`
`kwargs`		keyword arguments to pass to the query	`{}`

Returns:

Type	Description
`Any`	the result of the query

Source code in docarray/index/backends/weaviate.py

def execute_query(self, query: Any, *args, **kwargs) -> Any:
    """
    Run a query against the WeaviateDocumentIndex.

    Two kinds of input are handled:

    1. The output of this Document index' `QueryBuilder.build()` method. All of its
    queries are sent in one multi-get request and every result set is turned into
    a DocList of the index' schema.
    2. A native (raw GraphQL string) query of the underlying database. This is meant
    as a passthrough so that you can enjoy any functionality that is not available
    through the Document index API.

    Any other input falls through and yields None.

    :param query: the query to execute
    :param args: positional arguments to pass to the query
    :param kwargs: keyword arguments to pass to the query
    :return: for a QueryBuilder, a single DocList if there is exactly one result
        set and a list of DocLists otherwise; for a string, the raw client response
    """
    doclist_cls = DocList.__class_getitem__(cast(Type[BaseDoc], self._schema))

    if isinstance(query, self.QueryBuilder):
        response = self._client.query.multi_get(query._queries).do()
        result_sets = response["data"]["Get"].values()

        results = []
        for result_set in result_sets:
            # TODO: use
            # self._schema(**self._parse_weaviate_result(doc))
            # when https://github.com/weaviate/weaviate/issues/2858
            # is fixed
            parsed_docs = [
                self._schema.from_view(self._parse_weaviate_result(doc))  # type: ignore
                for doc in result_set
            ]
            results.append(doclist_cls(parsed_docs))

        if len(results) > 1:
            return results
        return results[0]

    # TODO: validate graphql query string before sending it to weaviate
    if isinstance(query, str):
        return self._client.query.raw(query)

`filter(filter_query, limit=10, **kwargs)`

Find documents in the index based on a filter query

Parameters:

Name	Type	Description	Default
`filter_query`	`Any`	the DB specific filter query to execute	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`DocList`	a DocList containing the documents that match the filter query

Source code in docarray/index/abstract.py

def filter(
    self,
    filter_query: Any,
    limit: int = 10,
    **kwargs,
) -> DocList:
    """Run a filter query against the index.

    The backend specific `_filter` does the work; a plain list result is
    converted into a DocList.

    :param filter_query: the DB specific filter query to execute
    :param limit: maximum number of documents to return
    :return: a DocList containing the documents that match the filter query
    """
    self._logger.debug(f'Executing `filter` for the query {filter_query}')
    found = self._filter(filter_query, limit=limit, **kwargs)

    # anything that already is a DocList (or is not a list at all) is passed on
    if isinstance(found, DocList) or not isinstance(found, List):
        return found
    return self._dict_list_to_docarray(found)

`filter_batched(filter_queries, limit=10, **kwargs)`

Find documents in the index based on multiple filter queries.

Parameters:

Name	Type	Description	Default
`filter_queries`	`Any`	the DB specific filter query to execute	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`List[DocList]`	a list of DocLists, one per filter query, each containing the documents that match that query

Source code in docarray/index/abstract.py

def filter_batched(
    self,
    filter_queries: Any,
    limit: int = 10,
    **kwargs,
) -> List[DocList]:
    """Run several filter queries against the index.

    The backend specific `_filter_batched` does the work; if its first result is
    a plain list, every result is converted into a DocList.

    :param filter_queries: the DB specific filter queries to execute
    :param limit: maximum number of documents to return
    :return: one result per filter query, each holding the matching documents
    """
    self._logger.debug(
        f'Executing `filter_batched` for the queries {filter_queries}'
    )
    batched = self._filter_batched(filter_queries, limit=limit, **kwargs)

    # the first entry decides whether the whole batch needs converting
    needs_conversion = len(batched) > 0 and isinstance(batched[0], List)
    if needs_conversion:
        batched = [self._dict_list_to_docarray(docs) for docs in batched]

    return batched  # type: ignore

`filter_subindex(filter_query, subindex, limit=10, **kwargs)`

Find documents in subindex level based on a filter query

Parameters:

Name	Type	Description	Default
`filter_query`	`Any`	the DB specific filter query to execute	required
`subindex`	`str`	name of the subindex to search on	required
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`DocList`	a DocList containing the subindex level documents that match the filter query

Source code in docarray/index/abstract.py

def filter_subindex(
    self,
    filter_query: Any,
    subindex: str,
    limit: int = 10,
    **kwargs,
) -> DocList:
    """Run a filter query on a subindex.

    A `subindex` name containing `__` addresses a nested subindex: the part
    before the first `__` selects the direct subindex and the remainder is
    resolved recursively by that subindex.

    :param filter_query: the DB specific filter query to execute
    :param subindex: name of the subindex to search on
    :param limit: maximum number of documents to return
    :return: a DocList containing the subindex level documents that match the filter query
    """
    self._logger.debug(
        f'Executing `filter` for the query {filter_query} in subindex {subindex}'
    )
    head, separator, remainder = subindex.partition('__')
    target = self._subindices[head]
    if separator:
        # still nested: let the direct subindex resolve the rest of the path
        return target.filter_subindex(filter_query, remainder, limit=limit, **kwargs)
    return target.filter(filter_query, limit=limit, **kwargs)

`find(query, search_field='', limit=10, **kwargs)`

Find k-nearest neighbors of the query.

Parameters:

Name	Type	Description	Default
`query`	`Union[AnyTensor, BaseDoc]`	query vector for KNN/ANN search. Has single axis.	required
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return per query	`10`

Returns:

Type	Description
	a named tuple containing `documents` and `scores`

Source code in docarray/index/backends/weaviate.py

def find(
    self,
    query: Union[AnyTensor, BaseDoc],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
):
    """
    Run a nearest neighbor search for a single query.

    If a Document is passed, its value in the index' embedding field is used as
    the query vector.

    :param query: query vector for KNN/ANN search. Has single axis.
    :param search_field: name of the field to search on; must be left as an
        empty string for WeaviateDocumentIndex
    :param limit: maximum number of documents to return per query
    :return: a named tuple containing `documents` and `scores`
    :raises ValueError: if `search_field` is not an empty string
    """
    self._logger.debug('Executing `find`')
    if search_field != '':
        raise ValueError(
            'Argument search_field is not supported for WeaviateDocumentIndex.\nSet search_field to an empty string to proceed.'
        )

    embedding_field = self._get_embedding_field()
    query_vec = (
        self._get_values_by_column([query], embedding_field)[0]
        if isinstance(query, BaseDoc)
        else query
    )
    found, scores = self._find(
        self._to_numpy(query_vec), search_field=search_field, limit=limit, **kwargs
    )

    # plain lists are converted into a DocList, DocLists are passed on untouched
    if not isinstance(found, DocList) and isinstance(found, List):
        found = self._dict_list_to_docarray(found)

    return FindResult(documents=found, scores=scores)

`find_batched(queries, search_field='', limit=10, **kwargs)`

Find documents in the index using nearest neighbor search.

Parameters:

Name	Type	Description	Default
`queries`	`Union[AnyTensor, DocList]`	query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a, or a DocList. If a tensor-like is passed, it should have shape (batch_size, vector_dim)	required
`search_field`	`str`	name of the field to search on. Documents in the index are retrieved based on this similarity of this field to the query.	`''`
`limit`	`int`	maximum number of documents to return per query	`10`

Returns:

Type	Description
`FindResultBatched`	a named tuple containing `documents` and `scores`

Source code in docarray/index/backends/weaviate.py

def find_batched(
    self,
    queries: Union[AnyTensor, DocList],
    search_field: str = '',
    limit: int = 10,
    **kwargs: Any,
) -> FindResultBatched:
    """Run a nearest-neighbor search for a batch of queries.

    :param queries: query vectors for KNN/ANN search.
        Either a tensor-like (np.array, torch.Tensor, etc.) of shape
        (batch_size, vector_dim), or a DocList.
    :param search_field: must be left as an empty string; it is only
        accepted for signature compatibility.
    :param limit: maximum number of documents to return per query
    :return: a named tuple containing `documents` and `scores`
    :raises ValueError: if `search_field` is not an empty string
    """
    self._logger.debug('Executing `find_batched`')
    if search_field != '':
        raise ValueError(
            'Argument search_field is not supported for WeaviateDocumentIndex.\nSet search_field to an empty string to proceed.'
        )
    vector_field = self._get_embedding_field()

    if not isinstance(queries, Sequence):
        # Not a sequence: convert the whole input to numpy in one go.
        batch_np = self._to_numpy(queries)
    else:
        # Sequence input (e.g. a DocList): read each vector from the
        # embedding field and stack them into a single array.
        per_doc_vectors = self._get_values_by_column(queries, vector_field)
        batch_np = np.stack(tuple(map(self._to_numpy, per_doc_vectors)))

    results, scores = self._find_batched(
        batch_np, search_field=search_field, limit=limit, **kwargs
    )

    # Only the first entry is inspected to decide whether conversion is needed.
    if len(results) > 0 and isinstance(results[0], List):
        results = [self._dict_list_to_docarray(hits) for hits in results]

    return FindResultBatched(documents=results, scores=scores)  # type: ignore

`find_subindex(query, subindex='', search_field='', limit=10, **kwargs)`

Find documents in subindex level.

Parameters:

Name	Type	Description	Default
`query`	`Union[AnyTensor, BaseDoc]`	query vector for KNN/ANN search. Can be either a tensor-like (np.array, torch.Tensor, etc.) with a single axis, or a Document	required
`subindex`	`str`	name of the subindex to search on	`''`
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`SubindexFindResult`	a named tuple containing root docs, subindex docs and scores

Source code in docarray/index/abstract.py

def find_subindex(
    self,
    query: Union[AnyTensor, BaseDoc],
    subindex: str = '',
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> SubindexFindResult:
    """Search inside a subindex and also return the matching root documents.

    :param query: query vector for KNN/ANN search.
        Can be either a tensor-like (np.array, torch.Tensor, etc.)
        with a single axis, or a Document
    :param subindex: name of the subindex to search on; nested levels are
        separated by `__`
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :return: a named tuple containing root docs, subindex docs and scores
    """
    self._logger.debug(f'Executing `find_subindex` for search field {search_field}')

    sub_docs, scores = self._find_subdocs(
        query, subindex=subindex, search_field=search_field, limit=limit, **kwargs
    )

    # Split the subindex path into its first level and the remaining levels.
    first_level, *deeper_levels = subindex.split('__')
    nested_path = '__'.join(deeper_levels)

    # Resolve all root ids first, then fetch the root documents one by one.
    root_ids = [
        self._get_root_doc_id(sub_doc.id, first_level, nested_path)
        for sub_doc in sub_docs
    ]
    root_docs = DocList[self._schema]()  # type: ignore
    for root_id in root_ids:
        root_docs.append(self[root_id])

    return SubindexFindResult(
        root_documents=root_docs, sub_documents=sub_docs, scores=scores  # type: ignore
    )

`index(docs, **kwargs)`

index Documents into the index.

Note

Passing a sequence of Documents that is not a DocList (such as a List of Docs) comes at a performance penalty. This is because the Index needs to check compatibility between itself and the data. With a DocList as input this is a single check; for other inputs compatibility needs to be checked for every Document individually.

Parameters:

Name	Type	Description	Default
`docs`	`Union[BaseDoc, Sequence[BaseDoc]]`	Documents to index.	required

Source code in docarray/index/abstract.py

def index(self, docs: Union[BaseDoc, Sequence[BaseDoc]], **kwargs):
    """Validate the given Documents and write them to the index.

    !!! note
        A DocList is the cheapest input: compatibility with the Index is
        checked once for the whole DocList. Any other sequence of Documents
        (such as a List of Docs) is checked Document by Document, which is
        slower.

    :param docs: Documents to index; a single Document or a sequence.
    """
    if isinstance(docs, BaseDoc):
        n_docs = 1
    else:
        n_docs = len(docs)
    self._logger.debug(f'Indexing {n_docs} documents')

    validated = self._validate_docs(docs)
    self._update_subindex_data(validated)
    # Hand the data to the backend column by column.
    columns = self._get_col_value_dict(validated)
    self._index(columns, **kwargs)

`num_docs()`

Get the number of documents.

Source code in docarray/index/backends/weaviate.py

def num_docs(self) -> int:
    """
    Return the document count reported by a meta-count aggregate query
    on this index.
    """
    class_name = self.index_name
    response = self._client.query.aggregate(class_name).with_meta_count().do()
    # TODO: decorator to check for errors
    aggregate_rows = response["data"]["Aggregate"][class_name]
    return aggregate_rows[0]["meta"]["count"]

`python_type_to_db_type(python_type)`

Map python type to database type. Takes any python type and returns the corresponding database column type.

Parameters:

Name	Type	Description	Default
`python_type`	`Type`	a python type.	required

Returns:

Type	Description
`Any`	the corresponding database column type. A `ValueError` is raised if `python_type` is not supported.

Source code in docarray/index/backends/weaviate.py

def python_type_to_db_type(self, python_type: Type) -> Any:
    """Map python type to database type.
    Takes any python type and returns the corresponding database column type.

    :param python_type: a python type.
    :return: the corresponding database column type.
    :raises ValueError: if ``python_type`` is not supported.
    """
    # Vector-like types are stored as arrays of numbers.
    if any(
        safe_issubclass(python_type, vec_type) for vec_type in WEAVIATE_PY_VEC_TYPES
    ):
        return 'number[]'

    # The lookup below returns the FIRST entry that `python_type` is a
    # subclass of, so a subclass must be listed before its base class.
    # In particular `bool` is a subclass of `int`: it has to come before
    # `int`, otherwise boolean columns would be mapped to 'int'.
    py_weaviate_type_map = {
        docarray.typing.ID: 'string',
        str: 'text',
        bool: 'boolean',
        int: 'int',
        float: 'number',
        np.ndarray: 'number[]',
        bytes: 'blob',
    }

    for py_type, weaviate_type in py_weaviate_type_map.items():
        if safe_issubclass(python_type, py_type):
            return weaviate_type

    raise ValueError(f'Unsupported column type for {type(self)}: {python_type}')

`subindex_contains(item)`

Checks if a given BaseDoc item is contained in the index or any of its subindices.

Parameters:

Name	Type	Description	Default
`item`	`BaseDoc`	the given BaseDoc	required

Returns:

Type	Description
`bool`	if the given BaseDoc item is contained in the index/subindices

Source code in docarray/index/abstract.py

def subindex_contains(self, item: BaseDoc) -> bool:
    """Tell whether a BaseDoc is stored in this index or in any subindex.

    An empty index returns False before the type of `item` is checked.

    :param item: the given BaseDoc
    :return: if the given BaseDoc item is contained in the index/subindices
    :raises TypeError: if the index is not empty and `item` is not an
        instance of BaseDoc or one of its subclasses
    """
    if self._is_index_empty:
        return False

    if not safe_issubclass(type(item), BaseDoc):
        raise TypeError(
            f"item must be an instance of BaseDoc or its subclass, not '{type(item).__name__}'"
        )

    # Check this index first; only fall back to the subindices when needed.
    return self.__contains__(item) or any(
        sub.subindex_contains(item) for sub in self._subindices.values()
    )

`text_search(query, search_field='', limit=10, **kwargs)`

Find documents in the index based on a text search query.

Parameters:

Name	Type	Description	Default
`query`	`Union[str, BaseDoc]`	The text to search for	required
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`FindResult`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def text_search(
    self,
    query: Union[str, BaseDoc],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResult:
    """Run a text search for a single query.

    :param query: The text to search for, or a Document whose
        `search_field` value is used as the text
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(f'Executing `text_search` for search field {search_field}')
    self._validate_search_field(search_field)

    # For a Document query, the text is read from the searched field.
    query_text = query
    if isinstance(query, BaseDoc):
        query_text = self._get_values_by_column([query], search_field)[0]

    found, scores = self._text_search(
        query_text, search_field=search_field, limit=limit, **kwargs
    )

    # Plain lists (as opposed to a DocList) still need to be converted.
    needs_conversion = isinstance(found, List) and not isinstance(found, DocList)
    if needs_conversion:
        found = self._dict_list_to_docarray(found)

    return FindResult(documents=found, scores=scores)

`text_search_batched(queries, search_field='', limit=10, **kwargs)`

Find documents in the index based on a text search query.

Parameters:

Name	Type	Description	Default
`queries`	`Union[Sequence[str], Sequence[BaseDoc]]`	The texts to search for	required
`search_field`	`str`	name of the field to search on	`''`
`limit`	`int`	maximum number of documents to return	`10`

Returns:

Type	Description
`FindResultBatched`	a named tuple containing `documents` and `scores`

Source code in docarray/index/abstract.py

def text_search_batched(
    self,
    queries: Union[Sequence[str], Sequence[BaseDoc]],
    search_field: str = '',
    limit: int = 10,
    **kwargs,
) -> FindResultBatched:
    """Run a text search for a batch of queries.

    :param queries: The texts to search for, or Documents whose
        `search_field` values are used as the texts. The first element
        decides which of the two is assumed, so the sequence must not be
        empty.
    :param search_field: name of the field to search on
    :param limit: maximum number of documents to return
    :return: a named tuple containing `documents` and `scores`
    """
    self._logger.debug(
        f'Executing `text_search_batched` for search field {search_field}'
    )
    self._validate_search_field(search_field)

    query_texts: Sequence[str]
    if isinstance(queries[0], BaseDoc):
        # Documents: read the text of each one from the searched field.
        query_texts = self._get_values_by_column(
            cast(Sequence[BaseDoc], queries), search_field
        )
    else:
        query_texts = cast(Sequence[str], queries)

    raw_results, scores = self._text_search_batched(
        query_texts, search_field=search_field, limit=limit, **kwargs
    )

    # Only the first entry is inspected to decide whether conversion is needed.
    if len(raw_results) > 0 and isinstance(raw_results[0], List):
        documents = [self._dict_list_to_docarray(hits) for hits in raw_results]
    else:
        documents = cast(List[DocList], raw_results)

    return FindResultBatched(documents=documents, scores=scores)

WeaviateDocumentIndex

docarray.index.backends.weaviate.WeaviateDocumentIndex

DBConfig dataclass

QueryBuilder

build(*args, **kwargs)

filter(where_filter)

filter_batched(filters)

find(query, score_name='certainty', score_threshold=None, **kwargs)

find_batched(queries, score_name='certainty', score_threshold=None)

text_search(query, search_field=None)

text_search_batched(queries, search_field=None)

RuntimeConfig dataclass

__contains__(item)

__delitem__(key)

__getitem__(key)

__init__(db_config=None, **kwargs)

build_query()

configure(runtime_config=None, **kwargs)

execute_query(query, *args, **kwargs)

filter(filter_query, limit=10, **kwargs)

filter_batched(filter_queries, limit=10, **kwargs)

filter_subindex(filter_query, subindex, limit=10, **kwargs)

find(query, search_field='', limit=10, **kwargs)

find_batched(queries, search_field='', limit=10, **kwargs)

find_subindex(query, subindex='', search_field='', limit=10, **kwargs)

index(docs, **kwargs)

num_docs()

python_type_to_db_type(python_type)

subindex_contains(item)

text_search(query, search_field='', limit=10, **kwargs)

text_search_batched(queries, search_field='', limit=10, **kwargs)

`docarray.index.backends.weaviate.WeaviateDocumentIndex`

`DBConfig` `dataclass`

`QueryBuilder`

`build(*args, **kwargs)`

`filter(where_filter)`

`filter_batched(filters)`

`find(query, score_name='certainty', score_threshold=None, **kwargs)`

`find_batched(queries, score_name='certainty', score_threshold=None)`

`text_search(query, search_field=None)`

`text_search_batched(queries, search_field=None)`

`RuntimeConfig` `dataclass`

`__contains__(item)`

`__delitem__(key)`

`__getitem__(key)`

`__init__(db_config=None, **kwargs)`

`build_query()`

`configure(runtime_config=None, **kwargs)`

`execute_query(query, *args, **kwargs)`

`filter(filter_query, limit=10, **kwargs)`

`filter_batched(filter_queries, limit=10, **kwargs)`

`filter_subindex(filter_query, subindex, limit=10, **kwargs)`

`find(query, search_field='', limit=10, **kwargs)`

`find_batched(queries, search_field='', limit=10, **kwargs)`

`find_subindex(query, subindex='', search_field='', limit=10, **kwargs)`

`index(docs, **kwargs)`

`num_docs()`

`python_type_to_db_type(python_type)`

`subindex_contains(item)`

`text_search(query, search_field='', limit=10, **kwargs)`

`text_search_batched(queries, search_field='', limit=10, **kwargs)`