From 32ab8317cbf3ef89a711a57090210e36ea49b1de Mon Sep 17 00:00:00 2001 From: Miguel Grinberg Date: Mon, 30 Sep 2024 16:02:23 +0100 Subject: [PATCH 1/3] Vectorstore: use a retriever query for hybrid search Fixes #2651 --- .../helpers/vectorstore/_async/strategies.py | 53 ++++-- .../helpers/vectorstore/_sync/strategies.py | 53 ++++-- .../test_vectorstore/test_vectorstore.py | 174 ++++++++++++------ 3 files changed, 190 insertions(+), 90 deletions(-) diff --git a/elasticsearch/helpers/vectorstore/_async/strategies.py b/elasticsearch/helpers/vectorstore/_async/strategies.py index a7f813f43..8f641f8f3 100644 --- a/elasticsearch/helpers/vectorstore/_async/strategies.py +++ b/elasticsearch/helpers/vectorstore/_async/strategies.py @@ -283,31 +283,46 @@ def _hybrid( ) -> Dict[str, Any]: # Add a query to the knn query. # RRF is used to even the score from the knn query and text query - # RRF has two optional parameters: {'rank_constant':int, 'window_size':int} + # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int} # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html + rrf_options = {} + if isinstance(self.rrf, Dict): + if "rank_constant" in self.rrf: + rrf_options["rank_constant"] = self.rrf["rank_constant"] + if "window_size" in self.rrf: + # 'window_size' was renamed to 'rank_window_size', but we support + # the older name for backwards compatiblit + rrf_options["rank_window_size"] = self.rrf["window_size"] + if "rank_window_size" in self.rrf: + rrf_options["rank_window_size"] = self.rrf["rank_window_size"] query_body = { - "knn": knn, - "query": { - "bool": { - "must": [ + "retriever": { + "rrf": { + "retrievers": [ { - "match": { - self.text_field: { - "query": query, - } - } - } + "standard": { + "query": { + "bool": { + "must": [ + { + "match": { + self.text_field: { + "query": query, + } + } + } + ], + "filter": filter, + } + }, + }, + }, + {"knn": knn}, ], - "filter": filter, - } + **rrf_options, + }, }, } - - if isinstance(self.rrf, Dict): - query_body["rank"] = {"rrf": self.rrf} - elif isinstance(self.rrf, bool) and self.rrf is True: - query_body["rank"] = {"rrf": {}} - return query_body def needs_inference(self) -> bool: diff --git a/elasticsearch/helpers/vectorstore/_sync/strategies.py b/elasticsearch/helpers/vectorstore/_sync/strategies.py index 928d34143..6029928af 100644 --- a/elasticsearch/helpers/vectorstore/_sync/strategies.py +++ b/elasticsearch/helpers/vectorstore/_sync/strategies.py @@ -283,31 +283,46 @@ def _hybrid( ) -> Dict[str, Any]: # Add a query to the knn query. # RRF is used to even the score from the knn query and text query - # RRF has two optional parameters: {'rank_constant':int, 'window_size':int} + # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int} # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html + rrf_options = {} + if isinstance(self.rrf, Dict): + if "rank_constant" in self.rrf: + rrf_options["rank_constant"] = self.rrf["rank_constant"] + if "window_size" in self.rrf: + # 'window_size' was renamed to 'rank_window_size', but we support + # the older name for backwards compatiblit + rrf_options["rank_window_size"] = self.rrf["window_size"] + if "rank_window_size" in self.rrf: + rrf_options["rank_window_size"] = self.rrf["rank_window_size"] query_body = { - "knn": knn, - "query": { - "bool": { - "must": [ + "retriever": { + "rrf": { + "retrievers": [ { - "match": { - self.text_field: { - "query": query, - } - } - } + "standard": { + "query": { + "bool": { + "must": [ + { + "match": { + self.text_field: { + "query": query, + } + } + } + ], + "filter": filter, + } + }, + }, + }, + {"knn": knn}, ], - "filter": filter, - } + **rrf_options, + }, }, } - - if isinstance(self.rrf, Dict): - query_body["rank"] = {"rrf": self.rrf} - elif isinstance(self.rrf, bool) and self.rrf is True: - query_body["rank"] = {"rrf": {}} - return query_body def needs_inference(self) -> bool: diff --git a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py index 820746acd..85ae13d80 100644 --- a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py +++ b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py @@ -349,20 +349,48 @@ def test_search_knn_with_hybrid_search( def assert_query(query_body: dict, query: Optional[str]) -> dict: assert query_body == { - "knn": { - "field": "vector_field", - "filter": [], - "k": 1, - "num_candidates": 50, - "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], - }, - "query": { - "bool": { - "filter": [], - "must": [{"match": {"text_field": {"query": "foo"}}}], + "retriever": { + "rrf": { + "retrievers": [ + { + "standard": { + "query": { + "bool": { + "filter": [], + "must": [ + { + "match": { + "text_field": {"query": "foo"} + } + } + ], + } + }, + }, + }, + { + "knn": { + "field": "vector_field", + "filter": [], + "k": 1, + "num_candidates": 50, + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ], + }, + }, + ], } - }, - "rank": {"rrf": {}}, + } } return query_body @@ -381,36 +409,52 @@ def assert_query( expected_rrf: Union[dict, bool], ) -> dict: cmp_query_body = { - "knn": { - "field": "vector_field", - "filter": [], - "k": 3, - "num_candidates": 50, - "query_vector": [ - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 0.0, - ], - }, - "query": { - "bool": { - "filter": [], - "must": [{"match": {"text_field": {"query": "foo"}}}], + "retriever": { + "rrf": { + "retrievers": [ + { + "standard": { + "query": { + "bool": { + "filter": [], + "must": [ + { + "match": { + "text_field": {"query": "foo"} + } + } + ], + } + }, + }, + }, + { + "knn": { + "field": "vector_field", + "filter": [], + "k": 3, + "num_candidates": 50, + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ], + }, + }, + ], } - }, + } } if isinstance(expected_rrf, dict): - cmp_query_body["rank"] = {"rrf": expected_rrf} - elif isinstance(expected_rrf, bool) and expected_rrf is True: - cmp_query_body["rank"] = {"rrf": {}} + cmp_query_body["retriever"]["rrf"].update(expected_rrf) assert query_body == cmp_query_body @@ -420,7 +464,7 @@ def assert_query( rrf_test_cases: List[Union[dict, bool]] = [ True, False, - {"rank_constant": 1, "window_size": 5}, + {"rank_constant": 1, "rank_window_size": 5}, ] for rrf_test_case in rrf_test_cases: store = VectorStore( @@ -441,21 +485,47 @@ def assert_query( # 2. check query result is okay es_output = store.client.search( index=index, - query={ - "bool": { - "filter": [], - "must": [{"match": {"text_field": {"query": "foo"}}}], + retriever={ + "rrf": { + "retrievers": [ + { + "knn": { + "field": "vector_field", + "filter": [], + "k": 3, + "num_candidates": 50, + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ], + }, + }, + { + "standard": { + "query": { + "bool": { + "filter": [], + "must": [ + {"match": {"text_field": {"query": "foo"}}} + ], + } + }, + }, + }, + ], + "rank_constant": 1, + "rank_window_size": 5, } }, - knn={ - "field": "vector_field", - "filter": [], - "k": 3, - "num_candidates": 50, - "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0], - }, size=3, - rank={"rrf": {"rank_constant": 1, "window_size": 5}}, ) assert [o["_source"]["text_field"] for o in output] == [ From 203f67886dc63067eb2f19fdd9d69c786fdb1eba Mon Sep 17 00:00:00 2001 From: Miguel Grinberg Date: Mon, 30 Sep 2024 16:39:27 +0100 Subject: [PATCH 2/3] only run hybrid search tests when using a stack version >= 8.14 --- elasticsearch/helpers/vectorstore/_async/strategies.py | 2 +- elasticsearch/helpers/vectorstore/_sync/strategies.py | 2 +- .../test_server/test_vectorstore/test_vectorstore.py | 7 +++++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/elasticsearch/helpers/vectorstore/_async/strategies.py b/elasticsearch/helpers/vectorstore/_async/strategies.py index 8f641f8f3..31db31bcf 100644 --- a/elasticsearch/helpers/vectorstore/_async/strategies.py +++ b/elasticsearch/helpers/vectorstore/_async/strategies.py @@ -291,7 +291,7 @@ def _hybrid( rrf_options["rank_constant"] = self.rrf["rank_constant"] if "window_size" in self.rrf: # 'window_size' was renamed to 'rank_window_size', but we support - # the older name for backwards compatiblit + # the older name for backwards compatibility rrf_options["rank_window_size"] = self.rrf["window_size"] if "rank_window_size" in self.rrf: rrf_options["rank_window_size"] = self.rrf["rank_window_size"] diff --git a/elasticsearch/helpers/vectorstore/_sync/strategies.py b/elasticsearch/helpers/vectorstore/_sync/strategies.py index 6029928af..6356d062a 100644 --- a/elasticsearch/helpers/vectorstore/_sync/strategies.py +++ b/elasticsearch/helpers/vectorstore/_sync/strategies.py @@ -291,7 +291,7 @@ def _hybrid( rrf_options["rank_constant"] = self.rrf["rank_constant"] if "window_size" in self.rrf: # 'window_size' was renamed to 'rank_window_size', but we support - # the older name for backwards compatiblit + # the older name for backwards compatibility rrf_options["rank_window_size"] = self.rrf["window_size"] if "rank_window_size" in self.rrf: rrf_options["rank_window_size"] = self.rrf["rank_window_size"] diff --git a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py index 85ae13d80..141a1f318 100644 --- a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py +++ b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py @@ -33,6 +33,7 @@ VectorStore, ) from elasticsearch.helpers.vectorstore._sync._utils import model_is_deployed +from test_elasticsearch.utils import es_version from . import ConsistentFakeEmbeddings, FakeEmbeddings @@ -337,6 +338,9 @@ def test_search_knn_with_hybrid_search( self, sync_client: Elasticsearch, index: str ) -> None: """Test end to end construction and search with metadata.""" + if es_version(sync_client) < (8, 14): + pytest.skip("This test requires Elasticsearch 8.14 or newer") + store = VectorStore( index=index, retrieval_strategy=DenseVectorStrategy(hybrid=True), @@ -401,6 +405,9 @@ def test_search_knn_with_hybrid_search_rrf( self, sync_client: Elasticsearch, index: str ) -> None: """Test end to end construction and rrf hybrid search with metadata.""" + if es_version(sync_client) < (8, 14): + pytest.skip("This test requires Elasticsearch 8.14 or newer") + texts = ["foo", "bar", "baz"] def assert_query( From b9bf9521d076301baa369b1ee71de9e98ac636af Mon Sep 17 00:00:00 2001 From: Miguel Grinberg Date: Thu, 3 Oct 2024 17:31:21 +0100 Subject: [PATCH 3/3] add support for rrf=False back --- .../helpers/vectorstore/_async/strategies.py | 76 +++++++------- .../helpers/vectorstore/_sync/strategies.py | 76 +++++++------- .../test_vectorstore/test_vectorstore.py | 99 ++++++++++--------- 3 files changed, 133 insertions(+), 118 deletions(-) diff --git a/elasticsearch/helpers/vectorstore/_async/strategies.py b/elasticsearch/helpers/vectorstore/_async/strategies.py index 31db31bcf..10524e243 100644 --- a/elasticsearch/helpers/vectorstore/_async/strategies.py +++ b/elasticsearch/helpers/vectorstore/_async/strategies.py @@ -285,44 +285,50 @@ def _hybrid( # RRF is used to even the score from the knn query and text query # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int} # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html - rrf_options = {} - if isinstance(self.rrf, Dict): - if "rank_constant" in self.rrf: - rrf_options["rank_constant"] = self.rrf["rank_constant"] - if "window_size" in self.rrf: - # 'window_size' was renamed to 'rank_window_size', but we support - # the older name for backwards compatibility - rrf_options["rank_window_size"] = self.rrf["window_size"] - if "rank_window_size" in self.rrf: - rrf_options["rank_window_size"] = self.rrf["rank_window_size"] - query_body = { - "retriever": { - "rrf": { - "retrievers": [ + standard_query = { + "query": { + "bool": { + "must": [ { - "standard": { - "query": { - "bool": { - "must": [ - { - "match": { - self.text_field: { - "query": query, - } - } - } - ], - "filter": filter, - } - }, - }, - }, - {"knn": knn}, + "match": { + self.text_field: { + "query": query, + } + } + } ], - **rrf_options, - }, - }, + "filter": filter, + } + } } + + if self.rrf is False: + query_body = { + "knn": knn, + **standard_query, + } + else: + rrf_options = {} + if isinstance(self.rrf, Dict): + if "rank_constant" in self.rrf: + rrf_options["rank_constant"] = self.rrf["rank_constant"] + if "window_size" in self.rrf: + # 'window_size' was renamed to 'rank_window_size', but we support + # the older name for backwards compatibility + rrf_options["rank_window_size"] = self.rrf["window_size"] + if "rank_window_size" in self.rrf: + rrf_options["rank_window_size"] = self.rrf["rank_window_size"] + query_body = { + "retriever": { + "rrf": { + "retrievers": [ + {"standard": standard_query}, + {"knn": knn}, + ], + **rrf_options, + }, + }, + } return query_body def needs_inference(self) -> bool: diff --git a/elasticsearch/helpers/vectorstore/_sync/strategies.py b/elasticsearch/helpers/vectorstore/_sync/strategies.py index 6356d062a..99c9baec2 100644 --- a/elasticsearch/helpers/vectorstore/_sync/strategies.py +++ b/elasticsearch/helpers/vectorstore/_sync/strategies.py @@ -285,44 +285,50 @@ def _hybrid( # RRF is used to even the score from the knn query and text query # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int} # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html - rrf_options = {} - if isinstance(self.rrf, Dict): - if "rank_constant" in self.rrf: - rrf_options["rank_constant"] = self.rrf["rank_constant"] - if "window_size" in self.rrf: - # 'window_size' was renamed to 'rank_window_size', but we support - # the older name for backwards compatibility - rrf_options["rank_window_size"] = self.rrf["window_size"] - if "rank_window_size" in self.rrf: - rrf_options["rank_window_size"] = self.rrf["rank_window_size"] - query_body = { - "retriever": { - "rrf": { - "retrievers": [ + standard_query = { + "query": { + "bool": { + "must": [ { - "standard": { - "query": { - "bool": { - "must": [ - { - "match": { - self.text_field: { - "query": query, - } - } - } - ], - "filter": filter, - } - }, - }, - }, - {"knn": knn}, + "match": { + self.text_field: { + "query": query, + } + } + } ], - **rrf_options, - }, - }, + "filter": filter, + } + } } + + if self.rrf is False: + query_body = { + "knn": knn, + **standard_query, + } + else: + rrf_options = {} + if isinstance(self.rrf, Dict): + if "rank_constant" in self.rrf: + rrf_options["rank_constant"] = self.rrf["rank_constant"] + if "window_size" in self.rrf: + # 'window_size' was renamed to 'rank_window_size', but we support + # the older name for backwards compatibility + rrf_options["rank_window_size"] = self.rrf["window_size"] + if "rank_window_size" in self.rrf: + rrf_options["rank_window_size"] = self.rrf["rank_window_size"] + query_body = { + "retriever": { + "rrf": { + "retrievers": [ + {"standard": standard_query}, + {"knn": knn}, + ], + **rrf_options, + }, + }, + } return query_body def needs_inference(self) -> bool: diff --git a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py index 141a1f318..096beaef5 100644 --- a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py +++ b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py @@ -415,64 +415,67 @@ def assert_query( query: Optional[str], expected_rrf: Union[dict, bool], ) -> dict: - cmp_query_body = { - "retriever": { - "rrf": { - "retrievers": [ - { - "standard": { - "query": { - "bool": { - "filter": [], - "must": [ - { - "match": { - "text_field": {"query": "foo"} - } - } - ], - } - }, - }, - }, - { - "knn": { - "field": "vector_field", - "filter": [], - "k": 3, - "num_candidates": 50, - "query_vector": [ - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 1.0, - 0.0, - ], - }, - }, - ], + standard_query = { + "query": { + "bool": { + "filter": [], + "must": [{"match": {"text_field": {"query": "foo"}}}], } } } + knn_query = { + "field": "vector_field", + "filter": [], + "k": 3, + "num_candidates": 50, + "query_vector": [ + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 1.0, + 0.0, + ], + } - if isinstance(expected_rrf, dict): - cmp_query_body["retriever"]["rrf"].update(expected_rrf) + if expected_rrf is not False: + cmp_query_body = { + "retriever": { + "rrf": { + "retrievers": [ + {"standard": standard_query}, + {"knn": knn_query}, + ], + } + } + } + if isinstance(expected_rrf, dict): + cmp_query_body["retriever"]["rrf"].update(expected_rrf) + else: + cmp_query_body = { + "knn": knn_query, + **standard_query, + } assert query_body == cmp_query_body return query_body # 1. check query_body is okay - rrf_test_cases: List[Union[dict, bool]] = [ - True, - False, - {"rank_constant": 1, "rank_window_size": 5}, - ] + if es_version(sync_client) >= (8, 14): + rrf_test_cases: List[Union[dict, bool]] = [ + True, + False, + {"rank_constant": 1, "rank_window_size": 5}, + ] + else: + # for 8.13.x and older there is no retriever query, so we can only + # run hybrid searches with rrf=False + rrf_test_cases: List[Union[dict, bool]] = [False] for rrf_test_case in rrf_test_cases: store = VectorStore( index=index,