Skip to content

find

docarray.utils.find

find(index, query, search_field='', metric='cosine_sim', limit=10, device=None, descending=None, cache=None)

Find the closest Documents in the index to the query. Supports PyTorch and NumPy embeddings.

Note

This is a simple implementation of exact search. If you need advanced search, such as approximate nearest neighbour search, hybrid search, or multi-vector search, please take a look at the BaseDoc.


# Usage example for `find`: search an in-memory DocList for the documents
# whose `embedding` field is closest to a single query.
from docarray import DocList, BaseDoc
from docarray.typing import TorchTensor
from docarray.utils.find import find
import torch


# Minimal document schema with one tensor field to search on.
class MyDocument(BaseDoc):
    embedding: TorchTensor


# Index of 100 documents, each holding a random 128-element embedding.
index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

# use Document as query
# (the query embedding is read from the field named by `search_field`)
query = MyDocument(embedding=torch.rand(128))
top_matches, scores = find(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)

# use tensor as query
# (a raw 128-element tensor is passed in place of a Document)
query = torch.rand(128)
top_matches, scores = find(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)

Parameters:

Name Type Description Default
index AnyDocArray

the index of Documents to search in

required
query Union[AnyTensor, BaseDoc]

the query to search for

required
search_field str

the tensor-like field in the index to use for the similarity computation

''
metric str

the distance metric to use for the similarity computation. Can be one of the following strings: 'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance, 'sqeuclidean_dist' for squared euclidean distance

'cosine_sim'
limit int

return the top limit results

10
device Optional[str]

the computational device to use, can be either cpu or a cuda device.

None
descending Optional[bool]

sort the results in descending order. Per default, this is chosen based on the metric argument.

None
cache Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]]

Precomputed data storing the valid index data per search field together with the valid indexes to account for deleted entries.

None

Returns:

Type Description
FindResult

A named tuple of the form (DocList, AnyTensor), where the first element contains the closest matches for the query, and the second element contains the corresponding scores.

Source code in docarray/utils/find.py
def find(
    index: AnyDocArray,
    query: Union[AnyTensor, BaseDoc],
    search_field: str = '',
    metric: str = 'cosine_sim',
    limit: int = 10,
    device: Optional[str] = None,
    descending: Optional[bool] = None,
    cache: Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]] = None,
) -> FindResult:
    """
    Return the Documents in `index` that are closest to a single `query`.
    Works with PyTorch and NumPy embeddings.

    This is the single-query convenience wrapper around `find_batched`: the
    query is converted to its embedding, handed to `find_batched`, and the
    result for the first (and only) query is returned.

    !!! note
        This is a simple implementation of exact search. If you need advanced
        search, such as approximate nearest neighbour search, hybrid search, or
        multi-vector search, please take a look at the [`BaseDoc`][docarray.base_doc.doc.BaseDoc].

    ---

    ```python
    from docarray import DocList, BaseDoc
    from docarray.typing import TorchTensor
    from docarray.utils.find import find
    import torch


    class MyDocument(BaseDoc):
        embedding: TorchTensor


    index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

    # use Document as query
    query = MyDocument(embedding=torch.rand(128))
    top_matches, scores = find(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )

    # use tensor as query
    query = torch.rand(128)
    top_matches, scores = find(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )
    ```

    ---

    :param index: the index of Documents to search in
    :param query: the query to search for, either a Document or a tensor
    :param search_field: the tensor-like field in the index to use
        for the similarity computation
    :param metric: the distance metric to use for the similarity computation.
        Can be one of the following strings:
        'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance,
        'sqeuclidean_dist' for squared euclidean distance
    :param limit: return the top `limit` results
    :param device: the computational device to use,
        can be either `cpu` or a `cuda` device.
    :param descending: sort the results in descending order.
        Per default, this is chosen based on the `metric` argument.
    :param cache: Precomputed data storing the valid index data per search field
        together with the valid indexes to account for deleted entries.
        Passed through unchanged to `find_batched`.
    :return: A named tuple of the form (DocList, AnyTensor),
        where the first element contains the closest matches for the query,
        and the second element contains the corresponding scores.
    """
    # Reduce the query (Document or tensor) to the embedding to search with.
    query_embedding = _extract_embedding_single(query, search_field)

    matches_per_query, scores_per_query = find_batched(
        index=index,
        query=query_embedding,
        search_field=search_field,
        metric=metric,
        limit=limit,
        device=device,
        descending=descending,
        cache=cache,
    )

    # Only one query was submitted, so only the first batch entry is relevant.
    return FindResult(documents=matches_per_query[0], scores=scores_per_query[0])

find_batched(index, query, search_field='', metric='cosine_sim', limit=10, device=None, descending=None, cache=None)

Find the closest Documents in the index to the queries. Supports PyTorch and NumPy embeddings.

Note

This is a simple implementation of exact search. If you need advanced search, such as approximate nearest neighbour search, hybrid search, or multi-vector search, please take a look at the BaseDoc.

Note

Only non-None embeddings will be considered from the index array


# Usage example for `find_batched`: search an in-memory DocList with several
# queries at once and read back the result for the first query.
from docarray import DocList, BaseDoc
from docarray.typing import TorchTensor
from docarray.utils.find import find_batched
import torch


# Minimal document schema with one tensor field to search on.
class MyDocument(BaseDoc):
    embedding: TorchTensor


# Index of 100 documents, each holding a random 128-element embedding.
index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

# use DocList as query
# (three query documents; results come back as one entry per query)
query = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
docs, scores = find_batched(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)
# Pick the matches and scores belonging to the first query.
top_matches, scores = docs[0], scores[0]

# use tensor as query
# (a 3 x 128 tensor: one row per query)
query = torch.rand(3, 128)
docs, scores = find_batched(
    index=index,
    query=query,
    search_field='embedding',
    metric='cosine_sim',
)
# Pick the matches and scores belonging to the first query.
top_matches, scores = docs[0], scores[0]

Parameters:

Name Type Description Default
index AnyDocArray

the index of Documents to search in

required
query Union[AnyTensor, DocList]

the query to search for

required
search_field str

the tensor-like field in the index to use for the similarity computation

''
metric str

the distance metric to use for the similarity computation. Can be one of the following strings: 'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance, 'sqeuclidean_dist' for squared euclidean distance

'cosine_sim'
limit int

return the top limit results

10
device Optional[str]

the computational device to use, can be either cpu or a cuda device.

None
descending Optional[bool]

sort the results in descending order. Per default, this is chosen based on the metric argument.

None
cache Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]]

Precomputed data storing the valid index data per search field together with the valid indexes to account for deleted entries.

None

Returns:

Type Description
FindResultBatched

A named tuple of the form (DocList, AnyTensor), where the first element contains the closest matches for each query, and the second element contains the corresponding scores.

Source code in docarray/utils/find.py
def find_batched(
    index: AnyDocArray,
    query: Union[AnyTensor, DocList],
    search_field: str = '',
    metric: str = 'cosine_sim',
    limit: int = 10,
    device: Optional[str] = None,
    descending: Optional[bool] = None,
    cache: Optional[Dict[str, Tuple[AnyTensor, Optional[List[int]]]]] = None,
) -> FindResultBatched:
    """
    Find the closest Documents in the index to the queries.
    Supports PyTorch and NumPy embeddings.

    !!! note
        This is a simple implementation of exact search. If you need advanced
        search, such as approximate nearest neighbour search, hybrid search, or
        multi-vector search, please take a look at the [`BaseDoc`][docarray.base_doc.doc.BaseDoc].

    !!! note
        Only non-None embeddings will be considered from the `index` array

    ---

    ```python
    from docarray import DocList, BaseDoc
    from docarray.typing import TorchTensor
    from docarray.utils.find import find_batched
    import torch


    class MyDocument(BaseDoc):
        embedding: TorchTensor


    index = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(100)])

    # use DocList as query
    query = DocList[MyDocument]([MyDocument(embedding=torch.rand(128)) for _ in range(3)])
    docs, scores = find_batched(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )
    top_matches, scores = docs[0], scores[0]

    # use tensor as query
    query = torch.rand(3, 128)
    docs, scores = find_batched(
        index=index,
        query=query,
        search_field='embedding',
        metric='cosine_sim',
    )
    top_matches, scores = docs[0], scores[0]
    ```

    ---

    :param index: the index of Documents to search in
    :param query: the queries to search for
    :param search_field: the tensor-like field in the index to use
        for the similarity computation
    :param metric: the distance metric to use for the similarity computation.
        Can be one of the following strings:
        'cosine_sim' for cosine similarity, 'euclidean_dist' for euclidean distance,
        'sqeuclidean_dist' for squared euclidean distance.
        The name is looked up as an attribute of the computational backend's
        `Metrics`, so an unknown name raises `AttributeError`.
    :param limit: return the top `limit` results
    :param device: the computational device to use,
        can be either `cpu` or a `cuda` device.
    :param descending: sort the results in descending order.
        Per default, this is chosen based on the `metric` argument:
        metrics whose name ends in `_sim` are sorted in descending order,
        all others in ascending order.
    :param cache: Precomputed data storing the valid index data per search field
        together with the valid indexes to account for deleted entries.
        If a dict is passed and has no entry for `search_field` yet, the
        extracted index embeddings are stored in it (the dict is modified
        in place) so that later calls can reuse them.
    :return: A named tuple with the fields `documents` and `scores`.
        `documents` is a list holding one DocList of the closest matches
        per query, and `scores` is a list holding the corresponding scores
        per query, in the same order.
    """
    if descending is None:
        descending = metric.endswith('_sim')  # similarity metrics are descending

    # extract embeddings from query and index
    if cache is not None and search_field in cache:
        index_embeddings, valid_idx = cache[search_field]
    else:
        index_embeddings, valid_idx = _extract_embeddings(index, search_field)
        if cache is not None:
            cache[search_field] = (
                index_embeddings,
                valid_idx,
            )  # cache embedding for next query
    query_embeddings, _ = _extract_embeddings(query, search_field)
    _, comp_backend = _get_tensor_type_and_comp_backend_from_tensor(index_embeddings)

    # compute distances and return top results
    metric_fn = getattr(comp_backend.Metrics, metric)
    dists = metric_fn(query_embeddings, index_embeddings, device=device)
    top_scores, top_indices = comp_backend.Retrieval.top_k(
        dists, k=int(limit), device=device, descending=descending
    )

    # The top-k indices refer to positions in `index_embeddings`. When some
    # index entries were left out of the embeddings, map the positions onto the
    # matching subset of documents instead of the full index.
    candidate_index = index
    if valid_idx is not None and len(valid_idx) < len(index):
        candidate_index = index[valid_idx]

    batched_docs: List[DocList] = []
    scores = []
    for indices_per_query, scores_per_query in zip(top_indices, top_scores):
        docs_per_query: DocList = candidate_index[indices_per_query]
        batched_docs.append(docs_per_query)
        scores.append(scores_per_query)
    return FindResultBatched(documents=batched_docs, scores=scores)