find

`docarray.utils.find`

`find(index, query, search_field='', metric='cosine_sim', limit=10, device=None, descending=None, cache=None)`

Find the closest Documents in the index to the query. Supports PyTorch and NumPy embeddings.

Note

This is a simple implementation of exact search. If you need more advanced search, such as approximate nearest neighbour search, hybrid search, or multi-vector search, please take a look at the BaseDoc.

from docarray import DocList, BaseDoc
from docarray.typing import TorchTensor
from docarray.utils.find import find
import torch


# Document schema with a single tensor field; that field is searched below.
class MyDocument(BaseDoc):
    embedding: TorchTensor


# Index of 100 documents, each carrying a random 128-dimensional embedding.
index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

# use Document as query
query = MyDocument(embedding=torch.rand(128))
# The result unpacks into the matched documents and their scores.
top_matches, scores = find(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)

# use tensor as query
query = torch.rand(128)
top_matches, scores = find(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)

Parameters:

Name	Type	Description	Default
`index`	`AnyDocArray`	the index of Documents to search in	required
`query`	`Union[AnyTensor, BaseDoc]`	the query to search for	required
`search_field`	`str`	the tensor-like field in the index to use for the similarity computation	`''`
`metric`	`str`	the distance metric to use for the similarity computation. Can be one of the following strings: 'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance, 'sqeuclidean_dist' for squared euclidean distance	`'cosine_sim'`
`limit`	`int`	return the top `limit` results	`10`
`device`	`Optional[str]`	the computational device to use, can be either `cpu` or a `cuda` device.	`None`
`descending`	`Optional[bool]`	sort the results in descending order. Per default, this is chosen based on the `metric` argument.	`None`
`cache`	`Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]]`	Precomputed data storing the valid index data per search field together with the valid indexes to account for deleted entries.	`None`

Returns:

Type	Description
`FindResult`	A named tuple of the form (DocList, AnyTensor), where the first element contains the closest matches for the query, and the second element contains the corresponding scores.

Source code in docarray/utils/find.py

def find(
    index: AnyDocArray,
    query: Union[AnyTensor, BaseDoc],
    search_field: str = '',
    metric: str = 'cosine_sim',
    limit: int = 10,
    device: Optional[str] = None,
    descending: Optional[bool] = None,
    cache: Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]] = None,
) -> FindResult:
    """
    Return the Documents in the index that are closest to a single query.
    Works with PyTorch and NumPy embeddings.

    !!! note
        This is a simple implementation of exact search. If you need more
        advanced search, such as approximate nearest neighbour search, hybrid
        search, or multi-vector search, please take a look at the
        [`BaseDoc`][docarray.base_doc.doc.BaseDoc].

    ---

    ```python
    from docarray import DocList, BaseDoc
    from docarray.typing import TorchTensor
    from docarray.utils.find import find
    import torch


    class MyDocument(BaseDoc):
        embedding: TorchTensor


    index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

    # use Document as query
    query = MyDocument(embedding=torch.rand(128))
    top_matches, scores = find(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )

    # use tensor as query
    query = torch.rand(128)
    top_matches, scores = find(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )
    ```

    ---

    :param index: the index of Documents to search in
    :param query: the query to search for, either a Document or a tensor
    :param search_field: the tensor-like field in the index to use
        for the similarity computation
    :param metric: the distance metric to use for the similarity computation.
        Can be one of the following strings:
        'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance,
        'sqeuclidean_dist' for squared euclidean distance
    :param limit: return the top `limit` results
    :param device: the computational device to use,
        can be either `cpu` or a `cuda` device.
    :param descending: sort the results in descending order.
        Per default, this is chosen based on the `metric` argument.
    :param cache: Precomputed data storing the valid index data per search field
        together with the valid indexes to account for deleted entries.
    :return: A named tuple of the form (DocList, AnyTensor),
        where the first element contains the closest matches for the query,
        and the second element contains the corresponding scores.
    """
    # Reduce the query to its embedding, then delegate to the batched search.
    query_embedding = _extract_embedding_single(query, search_field)
    batched_result = find_batched(
        index=index,
        query=query_embedding,
        search_field=search_field,
        metric=metric,
        limit=limit,
        device=device,
        descending=descending,
        cache=cache,
    )
    docs_per_query, scores_per_query = batched_result
    # There is exactly one query here, so only the first batch entry is returned.
    return FindResult(documents=docs_per_query[0], scores=scores_per_query[0])

`find_batched(index, query, search_field='', metric='cosine_sim', limit=10, device=None, descending=None, cache=None)`

Find the closest Documents in the index to the queries. Supports PyTorch and NumPy embeddings.

Note

This is a simple implementation of exact search. If you need more advanced search, such as approximate nearest neighbour search, hybrid search, or multi-vector search, please take a look at the BaseDoc.

Note

Only non-None embeddings will be considered from the index array

from docarray import DocList, BaseDoc
from docarray.typing import TorchTensor
from docarray.utils.find import find_batched
import torch


# Document schema with a single tensor field; that field is searched below.
class MyDocument(BaseDoc):
    embedding: TorchTensor


# Index of 100 documents, each carrying a random 128-dimensional embedding.
index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

# use DocList as query
query = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
docs, scores = find_batched(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)
# Results hold one entry per query; take the matches and scores of the first query.
top_matches, scores = docs[0], scores[0]

# use tensor as query
# One row per query: 3 queries of 128 dimensions each.
query = torch.rand(3, 128)
docs, scores = find_batched(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)
top_matches, scores = docs[0], scores[0]

Parameters:

Name	Type	Description	Default
`index`	`AnyDocArray`	the index of Documents to search in	required
`query`	`Union[AnyTensor, DocList]`	the query to search for	required
`search_field`	`str`	the tensor-like field in the index to use for the similarity computation	`''`
`metric`	`str`	the distance metric to use for the similarity computation. Can be one of the following strings: 'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance, 'sqeuclidean_dist' for squared euclidean distance	`'cosine_sim'`
`limit`	`int`	return the top `limit` results	`10`
`device`	`Optional[str]`	the computational device to use, can be either `cpu` or a `cuda` device.	`None`
`descending`	`Optional[bool]`	sort the results in descending order. Per default, this is chosen based on the `metric` argument.	`None`
`cache`	`Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]]`	Precomputed data storing the valid index data per search field together with the valid indexes to account for deleted entries.	`None`

Returns:

Type	Description
`FindResultBatched`	A named tuple of the form (DocList, AnyTensor), where the first element contains the closest matches for each query, and the second element contains the corresponding scores.

Source code in docarray/utils/find.py

def find_batched(
    index: AnyDocArray,
    query: Union[AnyTensor, DocList],
    search_field: str = '',
    metric: str = 'cosine_sim',
    limit: int = 10,
    device: Optional[str] = None,
    descending: Optional[bool] = None,
    cache: Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]] = None,
) -> FindResultBatched:
    """
    Return, for every query, the Documents in the index that are closest to it.
    Works with PyTorch and NumPy embeddings.

    !!! note
        This is a simple implementation of exact search. If you need more
        advanced search, such as approximate nearest neighbour search, hybrid
        search, or multi-vector search, please take a look at the
        [`BaseDoc`][docarray.base_doc.doc.BaseDoc].

    !!! note
        Only non-None embeddings will be considered from the `index` array

    ---

    ```python
    from docarray import DocList, BaseDoc
    from docarray.typing import TorchTensor
    from docarray.utils.find import find_batched
    import torch


    class MyDocument(BaseDoc):
        embedding: TorchTensor


    index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

    # use DocList as query
    query = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
    docs, scores = find_batched(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )
    top_matches, scores = docs[0], scores[0]

    # use tensor as query
    query = torch.rand(3, 128)
    docs, scores = find_batched(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )
    top_matches, scores = docs[0], scores[0]
    ```

    ---

    :param index: the index of Documents to search in
    :param query: the queries to search for, either a DocList or a tensor
    :param search_field: the tensor-like field in the index to use
        for the similarity computation
    :param metric: the distance metric to use for the similarity computation.
        Can be one of the following strings:
        'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance,
        'sqeuclidean_dist' for squared euclidean distance
    :param limit: return the top `limit` results
    :param device: the computational device to use,
        can be either `cpu` or a `cuda` device.
    :param descending: sort the results in descending order.
        Per default, this is chosen based on the `metric` argument.
    :param cache: Precomputed data storing the valid index data per search field
        together with the valid indexes to account for deleted entries.
        When given, it is filled in for search fields it does not contain yet.
    :return: A named tuple of the form (DocList, AnyTensor),
        where the first element contains the closest matches for each query,
        and the second element contains the corresponding scores.
    """
    # Metrics named '*_sim' are sorted in descending order by default;
    # an explicit `descending` always wins.
    sort_descending = metric.endswith('_sim') if descending is None else descending

    # Index embeddings: extract them on a cache miss (storing them for the
    # next query when a cache was supplied), otherwise reuse the cached entry.
    if cache is None or search_field not in cache:
        index_embeddings, valid_idx = _extract_embeddings(index, search_field)
        if cache is not None:
            cache[search_field] = (index_embeddings, valid_idx)
    else:
        index_embeddings, valid_idx = cache[search_field]
    query_embeddings, _ = _extract_embeddings(query, search_field)
    _, comp_backend = _get_tensor_type_and_comp_backend_from_tensor(index_embeddings)

    # Score every query against the index and keep the top `limit` per query.
    compute_metric = getattr(comp_backend.Metrics, metric)
    distances = compute_metric(query_embeddings, index_embeddings, device=device)
    top_scores, top_indices = comp_backend.Retrieval.top_k(
        distances, k=int(limit), device=device, descending=sort_descending
    )

    # When fewer valid entries than index entries exist, resolve the top
    # indices against the index restricted to the valid entries.
    has_gaps = valid_idx is not None and len(valid_idx) < len(index)
    candidate_index = index[valid_idx] if has_gaps else index

    batched_docs: List[DocList] = []
    scores = []
    for indices_per_query, scores_per_query in zip(top_indices, top_scores):
        batched_docs.append(candidate_index[indices_per_query])
        scores.append(scores_per_query)
    return FindResultBatched(documents=batched_docs, scores=scores)