From 32ab8317cbf3ef89a711a57090210e36ea49b1de Mon Sep 17 00:00:00 2001
From: Miguel Grinberg <miguel.grinberg@gmail.com>
Date: Mon, 30 Sep 2024 16:02:23 +0100
Subject: [PATCH 1/3] Vectorstore: use a retriever query for hybrid search

Fixes #2651
---
 .../helpers/vectorstore/_async/strategies.py  |  53 ++++--
 .../helpers/vectorstore/_sync/strategies.py   |  53 ++++--
 .../test_vectorstore/test_vectorstore.py      | 174 ++++++++++++------
 3 files changed, 190 insertions(+), 90 deletions(-)

diff --git a/elasticsearch/helpers/vectorstore/_async/strategies.py b/elasticsearch/helpers/vectorstore/_async/strategies.py
index a7f813f43..8f641f8f3 100644
--- a/elasticsearch/helpers/vectorstore/_async/strategies.py
+++ b/elasticsearch/helpers/vectorstore/_async/strategies.py
@@ -283,31 +283,46 @@ def _hybrid(
     ) -> Dict[str, Any]:
         # Add a query to the knn query.
         # RRF is used to even the score from the knn query and text query
-        # RRF has two optional parameters: {'rank_constant':int, 'window_size':int}
+        # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int}
         # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html
+        rrf_options = {}
+        if isinstance(self.rrf, Dict):
+            if "rank_constant" in self.rrf:
+                rrf_options["rank_constant"] = self.rrf["rank_constant"]
+            if "window_size" in self.rrf:
+                # 'window_size' was renamed to 'rank_window_size', but we support
+                # the older name for backwards compatiblit
+                rrf_options["rank_window_size"] = self.rrf["window_size"]
+            if "rank_window_size" in self.rrf:
+                rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
         query_body = {
-            "knn": knn,
-            "query": {
-                "bool": {
-                    "must": [
+            "retriever": {
+                "rrf": {
+                    "retrievers": [
                         {
-                            "match": {
-                                self.text_field: {
-                                    "query": query,
-                                }
-                            }
-                        }
+                            "standard": {
+                                "query": {
+                                    "bool": {
+                                        "must": [
+                                            {
+                                                "match": {
+                                                    self.text_field: {
+                                                        "query": query,
+                                                    }
+                                                }
+                                            }
+                                        ],
+                                        "filter": filter,
+                                    }
+                                },
+                            },
+                        },
+                        {"knn": knn},
                     ],
-                    "filter": filter,
-                }
+                    **rrf_options,
+                },
             },
         }
-
-        if isinstance(self.rrf, Dict):
-            query_body["rank"] = {"rrf": self.rrf}
-        elif isinstance(self.rrf, bool) and self.rrf is True:
-            query_body["rank"] = {"rrf": {}}
-
         return query_body
 
     def needs_inference(self) -> bool:
diff --git a/elasticsearch/helpers/vectorstore/_sync/strategies.py b/elasticsearch/helpers/vectorstore/_sync/strategies.py
index 928d34143..6029928af 100644
--- a/elasticsearch/helpers/vectorstore/_sync/strategies.py
+++ b/elasticsearch/helpers/vectorstore/_sync/strategies.py
@@ -283,31 +283,46 @@ def _hybrid(
     ) -> Dict[str, Any]:
         # Add a query to the knn query.
         # RRF is used to even the score from the knn query and text query
-        # RRF has two optional parameters: {'rank_constant':int, 'window_size':int}
+        # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int}
         # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html
+        rrf_options = {}
+        if isinstance(self.rrf, Dict):
+            if "rank_constant" in self.rrf:
+                rrf_options["rank_constant"] = self.rrf["rank_constant"]
+            if "window_size" in self.rrf:
+                # 'window_size' was renamed to 'rank_window_size', but we support
+                # the older name for backwards compatiblit
+                rrf_options["rank_window_size"] = self.rrf["window_size"]
+            if "rank_window_size" in self.rrf:
+                rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
         query_body = {
-            "knn": knn,
-            "query": {
-                "bool": {
-                    "must": [
+            "retriever": {
+                "rrf": {
+                    "retrievers": [
                         {
-                            "match": {
-                                self.text_field: {
-                                    "query": query,
-                                }
-                            }
-                        }
+                            "standard": {
+                                "query": {
+                                    "bool": {
+                                        "must": [
+                                            {
+                                                "match": {
+                                                    self.text_field: {
+                                                        "query": query,
+                                                    }
+                                                }
+                                            }
+                                        ],
+                                        "filter": filter,
+                                    }
+                                },
+                            },
+                        },
+                        {"knn": knn},
                     ],
-                    "filter": filter,
-                }
+                    **rrf_options,
+                },
             },
         }
-
-        if isinstance(self.rrf, Dict):
-            query_body["rank"] = {"rrf": self.rrf}
-        elif isinstance(self.rrf, bool) and self.rrf is True:
-            query_body["rank"] = {"rrf": {}}
-
         return query_body
 
     def needs_inference(self) -> bool:
diff --git a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
index 820746acd..85ae13d80 100644
--- a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
+++ b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
@@ -349,20 +349,48 @@ def test_search_knn_with_hybrid_search(
 
         def assert_query(query_body: dict, query: Optional[str]) -> dict:
             assert query_body == {
-                "knn": {
-                    "field": "vector_field",
-                    "filter": [],
-                    "k": 1,
-                    "num_candidates": 50,
-                    "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
-                },
-                "query": {
-                    "bool": {
-                        "filter": [],
-                        "must": [{"match": {"text_field": {"query": "foo"}}}],
+                "retriever": {
+                    "rrf": {
+                        "retrievers": [
+                            {
+                                "standard": {
+                                    "query": {
+                                        "bool": {
+                                            "filter": [],
+                                            "must": [
+                                                {
+                                                    "match": {
+                                                        "text_field": {"query": "foo"}
+                                                    }
+                                                }
+                                            ],
+                                        }
+                                    },
+                                },
+                            },
+                            {
+                                "knn": {
+                                    "field": "vector_field",
+                                    "filter": [],
+                                    "k": 1,
+                                    "num_candidates": 50,
+                                    "query_vector": [
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        0.0,
+                                    ],
+                                },
+                            },
+                        ],
                     }
-                },
-                "rank": {"rrf": {}},
+                }
             }
             return query_body
 
@@ -381,36 +409,52 @@ def assert_query(
             expected_rrf: Union[dict, bool],
         ) -> dict:
             cmp_query_body = {
-                "knn": {
-                    "field": "vector_field",
-                    "filter": [],
-                    "k": 3,
-                    "num_candidates": 50,
-                    "query_vector": [
-                        1.0,
-                        1.0,
-                        1.0,
-                        1.0,
-                        1.0,
-                        1.0,
-                        1.0,
-                        1.0,
-                        1.0,
-                        0.0,
-                    ],
-                },
-                "query": {
-                    "bool": {
-                        "filter": [],
-                        "must": [{"match": {"text_field": {"query": "foo"}}}],
+                "retriever": {
+                    "rrf": {
+                        "retrievers": [
+                            {
+                                "standard": {
+                                    "query": {
+                                        "bool": {
+                                            "filter": [],
+                                            "must": [
+                                                {
+                                                    "match": {
+                                                        "text_field": {"query": "foo"}
+                                                    }
+                                                }
+                                            ],
+                                        }
+                                    },
+                                },
+                            },
+                            {
+                                "knn": {
+                                    "field": "vector_field",
+                                    "filter": [],
+                                    "k": 3,
+                                    "num_candidates": 50,
+                                    "query_vector": [
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        1.0,
+                                        0.0,
+                                    ],
+                                },
+                            },
+                        ],
                     }
-                },
+                }
             }
 
             if isinstance(expected_rrf, dict):
-                cmp_query_body["rank"] = {"rrf": expected_rrf}
-            elif isinstance(expected_rrf, bool) and expected_rrf is True:
-                cmp_query_body["rank"] = {"rrf": {}}
+                cmp_query_body["retriever"]["rrf"].update(expected_rrf)
 
             assert query_body == cmp_query_body
 
@@ -420,7 +464,7 @@ def assert_query(
         rrf_test_cases: List[Union[dict, bool]] = [
             True,
             False,
-            {"rank_constant": 1, "window_size": 5},
+            {"rank_constant": 1, "rank_window_size": 5},
         ]
         for rrf_test_case in rrf_test_cases:
             store = VectorStore(
@@ -441,21 +485,47 @@ def assert_query(
         # 2. check query result is okay
         es_output = store.client.search(
             index=index,
-            query={
-                "bool": {
-                    "filter": [],
-                    "must": [{"match": {"text_field": {"query": "foo"}}}],
+            retriever={
+                "rrf": {
+                    "retrievers": [
+                        {
+                            "knn": {
+                                "field": "vector_field",
+                                "filter": [],
+                                "k": 3,
+                                "num_candidates": 50,
+                                "query_vector": [
+                                    1.0,
+                                    1.0,
+                                    1.0,
+                                    1.0,
+                                    1.0,
+                                    1.0,
+                                    1.0,
+                                    1.0,
+                                    1.0,
+                                    0.0,
+                                ],
+                            },
+                        },
+                        {
+                            "standard": {
+                                "query": {
+                                    "bool": {
+                                        "filter": [],
+                                        "must": [
+                                            {"match": {"text_field": {"query": "foo"}}}
+                                        ],
+                                    }
+                                },
+                            },
+                        },
+                    ],
+                    "rank_constant": 1,
+                    "rank_window_size": 5,
                 }
             },
-            knn={
-                "field": "vector_field",
-                "filter": [],
-                "k": 3,
-                "num_candidates": 50,
-                "query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
-            },
             size=3,
-            rank={"rrf": {"rank_constant": 1, "window_size": 5}},
         )
 
         assert [o["_source"]["text_field"] for o in output] == [

From 203f67886dc63067eb2f19fdd9d69c786fdb1eba Mon Sep 17 00:00:00 2001
From: Miguel Grinberg <miguel.grinberg@gmail.com>
Date: Mon, 30 Sep 2024 16:39:27 +0100
Subject: [PATCH 2/3] only run hybrid search tests when using a stack version
 >= 8.14

---
 elasticsearch/helpers/vectorstore/_async/strategies.py     | 2 +-
 elasticsearch/helpers/vectorstore/_sync/strategies.py      | 2 +-
 .../test_server/test_vectorstore/test_vectorstore.py       | 7 +++++++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/elasticsearch/helpers/vectorstore/_async/strategies.py b/elasticsearch/helpers/vectorstore/_async/strategies.py
index 8f641f8f3..31db31bcf 100644
--- a/elasticsearch/helpers/vectorstore/_async/strategies.py
+++ b/elasticsearch/helpers/vectorstore/_async/strategies.py
@@ -291,7 +291,7 @@ def _hybrid(
                 rrf_options["rank_constant"] = self.rrf["rank_constant"]
             if "window_size" in self.rrf:
                 # 'window_size' was renamed to 'rank_window_size', but we support
-                # the older name for backwards compatiblit
+                # the older name for backwards compatibility
                 rrf_options["rank_window_size"] = self.rrf["window_size"]
             if "rank_window_size" in self.rrf:
                 rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
diff --git a/elasticsearch/helpers/vectorstore/_sync/strategies.py b/elasticsearch/helpers/vectorstore/_sync/strategies.py
index 6029928af..6356d062a 100644
--- a/elasticsearch/helpers/vectorstore/_sync/strategies.py
+++ b/elasticsearch/helpers/vectorstore/_sync/strategies.py
@@ -291,7 +291,7 @@ def _hybrid(
                 rrf_options["rank_constant"] = self.rrf["rank_constant"]
             if "window_size" in self.rrf:
                 # 'window_size' was renamed to 'rank_window_size', but we support
-                # the older name for backwards compatiblit
+                # the older name for backwards compatibility
                 rrf_options["rank_window_size"] = self.rrf["window_size"]
             if "rank_window_size" in self.rrf:
                 rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
diff --git a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
index 85ae13d80..141a1f318 100644
--- a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
+++ b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
@@ -33,6 +33,7 @@
     VectorStore,
 )
 from elasticsearch.helpers.vectorstore._sync._utils import model_is_deployed
+from test_elasticsearch.utils import es_version
 
 from . import ConsistentFakeEmbeddings, FakeEmbeddings
 
@@ -337,6 +338,9 @@ def test_search_knn_with_hybrid_search(
         self, sync_client: Elasticsearch, index: str
     ) -> None:
         """Test end to end construction and search with metadata."""
+        if es_version(sync_client) < (8, 14):
+            pytest.skip("This test requires Elasticsearch 8.14 or newer")
+
         store = VectorStore(
             index=index,
             retrieval_strategy=DenseVectorStrategy(hybrid=True),
@@ -401,6 +405,9 @@ def test_search_knn_with_hybrid_search_rrf(
         self, sync_client: Elasticsearch, index: str
     ) -> None:
         """Test end to end construction and rrf hybrid search with metadata."""
+        if es_version(sync_client) < (8, 14):
+            pytest.skip("This test requires Elasticsearch 8.14 or newer")
+
         texts = ["foo", "bar", "baz"]
 
         def assert_query(

From b9bf9521d076301baa369b1ee71de9e98ac636af Mon Sep 17 00:00:00 2001
From: Miguel Grinberg <miguel.grinberg@gmail.com>
Date: Thu, 3 Oct 2024 17:31:21 +0100
Subject: [PATCH 3/3] add support for rrf=False back

---
 .../helpers/vectorstore/_async/strategies.py  | 76 +++++++-------
 .../helpers/vectorstore/_sync/strategies.py   | 76 +++++++-------
 .../test_vectorstore/test_vectorstore.py      | 99 ++++++++++---------
 3 files changed, 133 insertions(+), 118 deletions(-)

diff --git a/elasticsearch/helpers/vectorstore/_async/strategies.py b/elasticsearch/helpers/vectorstore/_async/strategies.py
index 31db31bcf..10524e243 100644
--- a/elasticsearch/helpers/vectorstore/_async/strategies.py
+++ b/elasticsearch/helpers/vectorstore/_async/strategies.py
@@ -285,44 +285,50 @@ def _hybrid(
         # RRF is used to even the score from the knn query and text query
         # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int}
         # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html
-        rrf_options = {}
-        if isinstance(self.rrf, Dict):
-            if "rank_constant" in self.rrf:
-                rrf_options["rank_constant"] = self.rrf["rank_constant"]
-            if "window_size" in self.rrf:
-                # 'window_size' was renamed to 'rank_window_size', but we support
-                # the older name for backwards compatibility
-                rrf_options["rank_window_size"] = self.rrf["window_size"]
-            if "rank_window_size" in self.rrf:
-                rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
-        query_body = {
-            "retriever": {
-                "rrf": {
-                    "retrievers": [
+        standard_query = {
+            "query": {
+                "bool": {
+                    "must": [
                         {
-                            "standard": {
-                                "query": {
-                                    "bool": {
-                                        "must": [
-                                            {
-                                                "match": {
-                                                    self.text_field: {
-                                                        "query": query,
-                                                    }
-                                                }
-                                            }
-                                        ],
-                                        "filter": filter,
-                                    }
-                                },
-                            },
-                        },
-                        {"knn": knn},
+                            "match": {
+                                self.text_field: {
+                                    "query": query,
+                                }
+                            }
+                        }
                     ],
-                    **rrf_options,
-                },
-            },
+                    "filter": filter,
+                }
+            }
         }
+
+        if self.rrf is False:
+            query_body = {
+                "knn": knn,
+                **standard_query,
+            }
+        else:
+            rrf_options = {}
+            if isinstance(self.rrf, Dict):
+                if "rank_constant" in self.rrf:
+                    rrf_options["rank_constant"] = self.rrf["rank_constant"]
+                if "window_size" in self.rrf:
+                    # 'window_size' was renamed to 'rank_window_size', but we support
+                    # the older name for backwards compatibility
+                    rrf_options["rank_window_size"] = self.rrf["window_size"]
+                if "rank_window_size" in self.rrf:
+                    rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
+            query_body = {
+                "retriever": {
+                    "rrf": {
+                        "retrievers": [
+                            {"standard": standard_query},
+                            {"knn": knn},
+                        ],
+                        **rrf_options,
+                    },
+                },
+            }
         return query_body
 
     def needs_inference(self) -> bool:
diff --git a/elasticsearch/helpers/vectorstore/_sync/strategies.py b/elasticsearch/helpers/vectorstore/_sync/strategies.py
index 6356d062a..99c9baec2 100644
--- a/elasticsearch/helpers/vectorstore/_sync/strategies.py
+++ b/elasticsearch/helpers/vectorstore/_sync/strategies.py
@@ -285,44 +285,50 @@ def _hybrid(
         # RRF is used to even the score from the knn query and text query
         # RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int}
         # https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html
-        rrf_options = {}
-        if isinstance(self.rrf, Dict):
-            if "rank_constant" in self.rrf:
-                rrf_options["rank_constant"] = self.rrf["rank_constant"]
-            if "window_size" in self.rrf:
-                # 'window_size' was renamed to 'rank_window_size', but we support
-                # the older name for backwards compatibility
-                rrf_options["rank_window_size"] = self.rrf["window_size"]
-            if "rank_window_size" in self.rrf:
-                rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
-        query_body = {
-            "retriever": {
-                "rrf": {
-                    "retrievers": [
+        standard_query = {
+            "query": {
+                "bool": {
+                    "must": [
                         {
-                            "standard": {
-                                "query": {
-                                    "bool": {
-                                        "must": [
-                                            {
-                                                "match": {
-                                                    self.text_field: {
-                                                        "query": query,
-                                                    }
-                                                }
-                                            }
-                                        ],
-                                        "filter": filter,
-                                    }
-                                },
-                            },
-                        },
-                        {"knn": knn},
+                            "match": {
+                                self.text_field: {
+                                    "query": query,
+                                }
+                            }
+                        }
                     ],
-                    **rrf_options,
-                },
-            },
+                    "filter": filter,
+                }
+            }
         }
+
+        if self.rrf is False:
+            query_body = {
+                "knn": knn,
+                **standard_query,
+            }
+        else:
+            rrf_options = {}
+            if isinstance(self.rrf, Dict):
+                if "rank_constant" in self.rrf:
+                    rrf_options["rank_constant"] = self.rrf["rank_constant"]
+                if "window_size" in self.rrf:
+                    # 'window_size' was renamed to 'rank_window_size', but we support
+                    # the older name for backwards compatibility
+                    rrf_options["rank_window_size"] = self.rrf["window_size"]
+                if "rank_window_size" in self.rrf:
+                    rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
+            query_body = {
+                "retriever": {
+                    "rrf": {
+                        "retrievers": [
+                            {"standard": standard_query},
+                            {"knn": knn},
+                        ],
+                        **rrf_options,
+                    },
+                },
+            }
         return query_body
 
     def needs_inference(self) -> bool:
diff --git a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
index 141a1f318..096beaef5 100644
--- a/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
+++ b/test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
@@ -415,64 +415,67 @@ def assert_query(
             query: Optional[str],
             expected_rrf: Union[dict, bool],
         ) -> dict:
-            cmp_query_body = {
-                "retriever": {
-                    "rrf": {
-                        "retrievers": [
-                            {
-                                "standard": {
-                                    "query": {
-                                        "bool": {
-                                            "filter": [],
-                                            "must": [
-                                                {
-                                                    "match": {
-                                                        "text_field": {"query": "foo"}
-                                                    }
-                                                }
-                                            ],
-                                        }
-                                    },
-                                },
-                            },
-                            {
-                                "knn": {
-                                    "field": "vector_field",
-                                    "filter": [],
-                                    "k": 3,
-                                    "num_candidates": 50,
-                                    "query_vector": [
-                                        1.0,
-                                        1.0,
-                                        1.0,
-                                        1.0,
-                                        1.0,
-                                        1.0,
-                                        1.0,
-                                        1.0,
-                                        1.0,
-                                        0.0,
-                                    ],
-                                },
-                            },
-                        ],
+            standard_query = {
+                "query": {
+                    "bool": {
+                        "filter": [],
+                        "must": [{"match": {"text_field": {"query": "foo"}}}],
                     }
                 }
             }
+            knn_query = {
+                "field": "vector_field",
+                "filter": [],
+                "k": 3,
+                "num_candidates": 50,
+                "query_vector": [
+                    1.0,
+                    1.0,
+                    1.0,
+                    1.0,
+                    1.0,
+                    1.0,
+                    1.0,
+                    1.0,
+                    1.0,
+                    0.0,
+                ],
+            }
 
-            if isinstance(expected_rrf, dict):
-                cmp_query_body["retriever"]["rrf"].update(expected_rrf)
+            if expected_rrf is not False:
+                cmp_query_body = {
+                    "retriever": {
+                        "rrf": {
+                            "retrievers": [
+                                {"standard": standard_query},
+                                {"knn": knn_query},
+                            ],
+                        }
+                    }
+                }
+                if isinstance(expected_rrf, dict):
+                    cmp_query_body["retriever"]["rrf"].update(expected_rrf)
+            else:
+                cmp_query_body = {
+                    "knn": knn_query,
+                    **standard_query,
+                }
 
             assert query_body == cmp_query_body
 
             return query_body
 
         # 1. check query_body is okay
-        rrf_test_cases: List[Union[dict, bool]] = [
-            True,
-            False,
-            {"rank_constant": 1, "rank_window_size": 5},
-        ]
+        if es_version(sync_client) >= (8, 14):
+            rrf_test_cases: List[Union[dict, bool]] = [
+                True,
+                False,
+                {"rank_constant": 1, "rank_window_size": 5},
+            ]
+        else:
+            # for 8.13.x and older there is no retriever query, so we can only
+            # run hybrid searches with rrf=False
+            rrf_test_cases: List[Union[dict, bool]] = [False]
         for rrf_test_case in rrf_test_cases:
             store = VectorStore(
                 index=index,