From 8cf92982dde986ce0bb5e36b702a3686bc8e8a3f Mon Sep 17 00:00:00 2001
From: Jules Kuehn <jk@jules.lol>
Date: Tue, 12 Mar 2024 22:02:01 -0400
Subject: [PATCH] fix(PGVectorStore): Use OR in text search; correct IN
 behaviour (#11872)

* fix(PGVectorStore): Invert IN operator behaviour

* test: update test_postgres assertion

* docs: install postgres step in postgres notebook

* docs: postgres hybrid search w/ QueryFusionRetriever

* fix: pgvectorstore bug (just introduced in this branch)

* add comments; auto-formatting

* auto-formatting ipynb
---
 docs/examples/vector_stores/postgres.ipynb    | 129 +++++++++++++++---
 .../vector_stores/postgres/base.py            |  36 +++--
 .../tests/test_postgres.py                    |   6 +-
 3 files changed, 136 insertions(+), 35 deletions(-)

diff --git a/docs/examples/vector_stores/postgres.ipynb b/docs/examples/vector_stores/postgres.ipynb
index 6643d7e835..659f6d436a 100644
--- a/docs/examples/vector_stores/postgres.ipynb
+++ b/docs/examples/vector_stores/postgres.ipynb
@@ -47,6 +47,30 @@
     "!pip install llama-index"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "eadf6b8a",
+   "metadata": {},
+   "source": [
+    "Running the following cell will install Postgres with PGVector in Colab."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ab46b5b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!sudo apt update\n",
+    "!sudo apt install -y postgresql-common\n",
+    "!echo | sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh\n",
+    "!sudo apt install postgresql-15-pgvector\n",
+    "!sudo service postgresql start\n",
+    "!sudo -u postgres psql -c \"ALTER USER postgres PASSWORD 'password';\"\n",
+    "!sudo -u postgres psql -c \"CREATE DATABASE vector_db;\""
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -104,7 +128,25 @@
    "execution_count": null,
    "id": "6df9fa89",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2024-03-12 21:24:54--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 75042 (73K) [text/plain]\n",
+      "Saving to: ‘data/paul_graham/paul_graham_essay.txt’\n",
+      "\n",
+      "data/paul_graham/pa 100%[===================>]  73.28K  --.-KB/s    in 0.002s  \n",
+      "\n",
+      "2024-03-12 21:24:54 (43.3 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "!mkdir -p 'data/paul_graham/'\n",
     "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
@@ -130,7 +172,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Document ID: 88efac05-2277-4eda-a94c-c9247c9aca1c\n"
+      "Document ID: 4a53fcae-ca36-4492-aeeb-28b858516a49\n"
      ]
     }
    ],
@@ -185,7 +227,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "40b4c3becfc64c5184360b8b8e81ca9a",
+       "model_id": "55d642be4afd424dbeddcc98a6313baa",
        "version_major": 2,
        "version_minor": 0
       },
@@ -199,7 +241,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "37763ad7b17f4481a7e67df379304d31",
+       "model_id": "4bedc19d901346dbafbeff1d25638562",
        "version_major": 2,
        "version_minor": 0
       },
@@ -262,12 +304,12 @@
      "output_type": "stream",
      "text": [
       "The author worked on writing and programming before college. Initially, the author wrote short\n",
-      "stories and later started programming on an IBM 1401 using an early version of Fortran. The author\n",
-      "then transitioned to working with microcomputers, building a computer kit and eventually getting a\n",
-      "TRS-80 to further explore programming. In college, the author initially planned to study philosophy\n",
-      "but switched to studying AI due to a lack of interest in philosophy courses. The author was inspired\n",
-      "to work on AI after encountering works like Heinlein's novel \"The Moon is a Harsh Mistress\" and\n",
-      "seeing Terry Winograd using SHRDLU in a PBS documentary.\n"
+      "stories and later started programming on the IBM 1401 using an early version of Fortran. With the\n",
+      "introduction of microcomputers, the author's programming experiences changed, leading to the\n",
+      "creation of simple games, prediction programs, and a word processor. Despite initially planning to\n",
+      "study philosophy in college, the author eventually switched to studying AI due to a lack of interest\n",
+      "in philosophy courses. The author was inspired to work on AI after encountering the novel \"The Moon\n",
+      "is a Harsh Mistress\" and watching a PBS documentary featuring SHRDLU.\n"
      ]
     }
    ],
@@ -375,14 +417,8 @@
    "id": "55745895-8f01-4275-abaa-b2ebef2cb4c7",
    "metadata": {},
    "source": [
-    "### Hybrid Search  "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "91cae40f-3cd4-4403-8af4-aca2705e96a2",
-   "metadata": {},
-   "source": [
+    "### Hybrid Search\n",
+    "\n",
     "To enable hybrid search, you need to:\n",
     "1. pass in `hybrid_search=True` when constructing the `PGVectorStore` (and optionally configure `text_search_config` with the desired language)\n",
     "2. pass in `vector_store_query_mode=\"hybrid\"` when constructing the query engine (this config is passed to the retriever under the hood). You can also optionally set the `sparse_top_k` to configure how many results we should obtain from sparse text search (default is using the same value as `similarity_top_k`). "
@@ -423,7 +459,16 @@
    "execution_count": null,
    "id": "6f8edee4-6c19-4d99-b602-110bdc5708e5",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspaces/llama_index/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/llama_index/vector_stores/postgres/base.py:553: SAWarning: UserDefinedType REGCONFIG() will not produce a cache key because the ``cache_ok`` attribute is not set to True.  This can have significant performance implications including some performance degradations in comparison to prior SQLAlchemy versions.  Set this attribute to True if this type object's state is safe to use in a cache key, or False to disable this warning. (Background on this warning at: https://sqlalche.me/e/20/cprf)\n",
+      "  res = session.execute(stmt)\n"
+     ]
+    }
+   ],
    "source": [
     "hybrid_query_engine = hybrid_index.as_query_engine(\n",
     "    vector_store_query_mode=\"hybrid\", sparse_top_k=2\n",
@@ -451,6 +496,52 @@
     "print(hybrid_response)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "74ac0f05",
+   "metadata": {},
+   "source": [
+    "#### Improving hybrid search with QueryFusionRetriever\n",
+    "\n",
+    "Since the scores for text search and vector search are calculated differently, the nodes that were found only by text search will have a much lower score.\n",
+    "\n",
+    "You can often improve hybrid search performance by using `QueryFusionRetriever`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "345ba0fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.response_synthesizers import CompactAndRefine\n",
+    "from llama_index.retrievers import QueryFusionRetriever\n",
+    "\n",
+    "vector_retriever = hybrid_query_engine.as_retriever(\n",
+    "    vector_store_query_mode=\"default\",\n",
+    "    similarity_top_k=5,\n",
+    ")\n",
+    "text_retriever = hybrid_query_engine.from_defaults(\n",
+    "    vector_store_query_mode=\"sparse\",\n",
+    "    similarity_top_k=5,  # interchangeable with sparse_top_k in this context\n",
+    ")\n",
+    "retriever = QueryFusionRetriever(\n",
+    "    [vector_retriever, text_retriever],\n",
+    "    similarity_top_k=5,\n",
+    "    num_queries=1,  # set this to 1 to disable query generation\n",
+    "    mode=\"relative_score\",\n",
+    "    use_async=False,\n",
+    "    verbose=True,\n",
+    ")\n",
+    "\n",
+    "response_synthesizer = CompactAndRefine()\n",
+    "query_engine = RetrieverQueryEngine(\n",
+    "    retriever=retriever,\n",
+    "    response_synthesizer=response_synthesizer,\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "2e5e8083",
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/llama_index/vector_stores/postgres/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/llama_index/vector_stores/postgres/base.py
index b7d114ddc3..194b5efd8d 100644
--- a/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/llama_index/vector_stores/postgres/base.py
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/llama_index/vector_stores/postgres/base.py
@@ -345,7 +345,7 @@ class PGVectorStore(BasePydanticVectorStore):
         elif operator == FilterOperator.LTE:
             return "<="
         elif operator == FilterOperator.IN:
-            return "@>"
+            return "IN"
         else:
             _logger.warning(f"Unknown operator: {operator}, fallback to '='")
             return "="
@@ -370,18 +370,17 @@ class PGVectorStore(BasePydanticVectorStore):
         return sqlalchemy_conditions[filters.condition](
             *(
                 (
-                    (
-                        sqlalchemy.text(
-                            f"metadata_::jsonb->'{filter_.key}' "
-                            f"{self._to_postgres_operator(filter_.operator)} "
-                            f"'[\"{filter_.value}\"]'"
-                        )
+                    sqlalchemy.text(
+                        f"metadata_->>'{filter_.key}' "
+                        f"{self._to_postgres_operator(filter_.operator)} "
+                        # For the IN operator, do not add quotes around the value:
+                        # it is expected to already be a parenthesized, comma-separated
+                        # list with each value quoted, e.g. ('Jane', 'John', 'Jack')
+                        f"{filter_.value}"
                         if filter_.operator == FilterOperator.IN
-                        else sqlalchemy.text(
-                            f"metadata_->>'{filter_.key}' "
-                            f"{self._to_postgres_operator(filter_.operator)} "
-                            f"'{filter_.value}'"
-                        )
+                        # Enclose the value in quotes for other operators
+                        else f"'{filter_.value}'"
                     )
                     if not isinstance(filter_, MetadataFilters)
                     else self._recursively_apply_filters(filter_)
@@ -501,8 +500,17 @@ class PGVectorStore(BasePydanticVectorStore):
         if query_str is None:
             raise ValueError("query_str must be specified for a sparse vector query.")
 
-        ts_query = func.plainto_tsquery(
-            type_coerce(self.text_search_config, REGCONFIG), query_str
+        # Replace '&' with '|' to perform an OR search for higher recall
+        ts_query = func.to_tsquery(
+            func.replace(
+                func.text(
+                    func.plainto_tsquery(
+                        type_coerce(self.text_search_config, REGCONFIG), query_str
+                    )
+                ),
+                "&",
+                "|",
+            )
         )
         stmt = (
             select(  # type: ignore
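
The rewritten sparse query keeps plainto_tsquery's parsing and stemming but joins the lexemes with OR instead of AND. A quick way to see the effect, sketched against the local database created by the new notebook install cell (user postgres, password 'password', and database vector_db are taken from that cell, so treat them as assumptions):

    import sqlalchemy

    engine = sqlalchemy.create_engine(
        "postgresql+psycopg2://postgres:password@localhost:5432/vector_db"
    )
    with engine.connect() as conn:
        # plainto_tsquery joins every lexeme with '&' (AND semantics) ...
        print(conn.execute(sqlalchemy.text(
            "SELECT plainto_tsquery('english', 'paul graham essay')::text"
        )).scalar())  # 'paul' & 'graham' & 'essay'

        # ... and the patch rewrites '&' to '|' before handing the string back to
        # to_tsquery, so a row matching any single term now satisfies the query.
        print(conn.execute(sqlalchemy.text(
            "SELECT to_tsquery(replace("
            "plainto_tsquery('english', 'paul graham essay')::text, '&', '|'))::text"
        )).scalar())  # 'paul' | 'graham' | 'essay'
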
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/tests/test_postgres.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/tests/test_postgres.py
index 0fd7d0f900..b10acaaa12 100644
--- a/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/tests/test_postgres.py
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-postgres/tests/test_postgres.py
@@ -269,7 +269,9 @@ async def test_add_to_db_and_query_with_metadata_filters_with_in_operator(
     filters = MetadataFilters(
         filters=[
             MetadataFilter(
-                key="test_key_list", value="test_value", operator=FilterOperator.IN
+                key="test_key",
+                value="('test_value', 'another_value')",
+                operator=FilterOperator.IN,
             )
         ]
     )
@@ -282,7 +284,7 @@ async def test_add_to_db_and_query_with_metadata_filters_with_in_operator(
         res = pg.query(q)
     assert res.nodes
     assert len(res.nodes) == 1
-    assert res.nodes[0].node_id == "ccc"
+    assert res.nodes[0].node_id == "bbb"
 
 
 @pytest.mark.skipif(postgres_not_available, reason="postgres db is not available")
-- 
GitLab
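
With the operator now mapped to IN, the filter value is expected to be one string containing the whole parenthesized, quoted list, exactly as the updated test passes it. A minimal sketch of the caller side, assuming an already-populated PGVectorStore instance named vector_store whose nodes carry an "author" metadata key and whose embedding column is 1536-dimensional (both names and the dimension are assumptions for illustration):

    from llama_index.core.vector_stores.types import (
        FilterOperator,
        MetadataFilter,
        MetadataFilters,
        VectorStoreQuery,
    )

    # One string holding the whole quoted, parenthesized list, mirroring the
    # format used in test_add_to_db_and_query_with_metadata_filters_with_in_operator.
    filters = MetadataFilters(
        filters=[
            MetadataFilter(
                key="author",
                value="('Jane', 'John', 'Jack')",
                operator=FilterOperator.IN,
            )
        ]
    )

    query = VectorStoreQuery(
        query_embedding=[0.1] * 1536,  # placeholder embedding; must match the table's dimension
        similarity_top_k=5,
        filters=filters,
    )
    result = vector_store.query(query)  # returns only nodes whose author is in the list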