diff --git a/docs/examples/vector_stores/TypesenseDemo.ipynb b/docs/examples/vector_stores/TypesenseDemo.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..3d93252f27991247fe64c44eff43fc2a9bef8ce7 --- /dev/null +++ b/docs/examples/vector_stores/TypesenseDemo.ipynb @@ -0,0 +1,175 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "9c48213d-6e6a-4c10-838a-2a7c710c3a05", + "metadata": {}, + "source": [ + "# Typesense Vector Store" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "50d3b817-b70e-4667-be4f-d3a0fe4bd119", + "metadata": {}, + "source": [ + "#### Load documents, build the VectorStoreIndex" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "690a6918-7c75-4f95-9ccc-d2c4a1fe00d7", + "metadata": {}, + "outputs": [], + "source": [ + "# import logging\n", + "# import sys\n", + "\n", + "# logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "from llama_index import VectorStoreIndex, SimpleDirectoryReader, StorageContext\n", + "from IPython.display import Markdown, display" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "03d1691e-544b-454f-825b-5ee12f7faa8a", + "metadata": {}, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader('../../../examples/paul_graham_essay/data').load_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ad144ee7-96da-4dd6-be00-fd6cf0c78e58", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from llama_index.vector_stores.typesense import TypesenseVectorStore\n", + "from typesense import Client\n", + "\n", + "typesense_client = Client({\n", + " 'api_key': 'xyz',\n", + " 'nodes': [{\n", + " 'host': 'localhost',\n", + " 'port': '8108',\n", + " 'protocol': 'http'\n", + " }],\n", + " 'connection_timeout_seconds': 2\n", + "})\n", + 
"typesense_vector_store = TypesenseVectorStore(typesense_client)\n", + "storage_context = StorageContext.from_defaults(vector_store=typesense_vector_store)\n", + "\n", + "index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b6caf93b-6345-4c65-a346-a95b0f1746c4", + "metadata": {}, + "source": [ + "#### Query Index" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bdda1b2c-ae46-47cf-91d7-3153e8d0473b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "The author grew up skipping a step in the evolution of computers, learning Italian, walking through Florence, painting people, working with technology companies, seeking signature styles at RISD, living in a rent-stabilized apartment, launching software, editing code (including Lisp expressions), writing essays, publishing them online, and receiving feedback from angry readers. He also experienced the exponential growth of commodity processors in the 1990s, which rolled up high-end, special-purpose hardware and software companies. He also learned how to make a little Italian go a long way by stringing together abstract concepts with a few simple verbs. He also experienced the tight coupling of money and coolness in the art world, and the fact that anything expensive comes to be seen as cool, and anything seen as cool will soon become equally expensive. He also experienced the challenge of launching software, as he had to recruit an initial set of users and make sure they had decent-looking stores before launching publicly. He also experienced the first instance of what is now a familiar experience, when he read the comments and found they were full of angry people. He also experienced the difference between putting something online and publishing it online. 
Finally, he wrote essays about topics he had stacked up, and wrote a more detailed version for others to read.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from llama_index.indices.query.schema import QueryBundle \n", + "from llama_index.embeddings import OpenAIEmbedding\n", + "\n", + "# By default, typesense vector store uses vector search. You need to provide the embedding yourself.\n", + "query_str = \"What did the author do growing up?\"\n", + "embed_model = OpenAIEmbedding()\n", + "# If your service context has an embed_model you can also do:\n", + "# embed_model = index.service_context.embed_model\n", + "query_embedding = embed_model.get_agg_embedding_from_queries(query_str)\n", + "query_bundle = QueryBundle(query_str, embedding=query_embedding)\n", + "response = index.as_query_engine().query(query_bundle)\n", + "\n", + "display(Markdown(f\"<b>{response}</b>\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "751fb318", + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "<b>\n", + "\n", + "The author grew up during the Internet Bubble and was running a startup. They had to hire more people than they wanted to in order to seem more professional and were at the mercy of their investors until Yahoo bought them. 
They learned a lot about retail and startups, and had to do a lot of things that they weren't necessarily good at in order to make their business successful.</b>" + ], + "text/plain": [ + "<IPython.core.display.Markdown object>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from llama_index.vector_stores.types import VectorStoreQueryMode\n", + "\n", + "# You can also use text search\n", + "\n", + "query_bundle = QueryBundle(query_str = query_str)\n", + "response = index.as_query_engine(vector_store_query_mode=VectorStoreQueryMode.TEXT_SEARCH).query(query_bundle)\n", + "display(Markdown(f\"<b>{response}</b>\"))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/llama_index/vector_stores/types.py b/llama_index/vector_stores/types.py index 0576a29db953cfc23fefa77dd2552d221618c2b6..f77288f0ea62c9a0e121dc4708af308843c2e205 100644 --- a/llama_index/vector_stores/types.py +++ b/llama_index/vector_stores/types.py @@ -4,7 +4,7 @@ from enum import Enum from typing import Any, List, Optional, Protocol, Union, runtime_checkable import fsspec -from pydantic import BaseModel, StrictInt, StrictFloat, StrictStr +from pydantic import BaseModel, StrictFloat, StrictInt, StrictStr from llama_index.data_structs.node import Node @@ -49,6 +49,7 @@ class VectorStoreQueryMode(str, Enum): DEFAULT = "default" SPARSE = "sparse" HYBRID = "hybrid" + TEXT_SEARCH = "text_search" # fit learners SVM = "svm" diff --git a/llama_index/vector_stores/typesense.py b/llama_index/vector_stores/typesense.py new file mode 100644 index 
"""Typesense Vector store index.

An index that is built on top of an existing Typesense collection.

"""

import logging
from typing import Any, Callable, List, Optional, cast

from llama_index import utils
from llama_index.data_structs.node import Node
from llama_index.vector_stores.types import (
    MetadataFilters,
    NodeWithEmbedding,
    VectorStore,
    VectorStoreQuery,
    VectorStoreQueryMode,
    VectorStoreQueryResult,
)
from llama_index.vector_stores.utils import (
    DEFAULT_TEXT_KEY,
    metadata_dict_to_node,
    node_to_metadata_dict,
)

_logger = logging.getLogger(__name__)

DEFAULT_COLLECTION_NAME = "default_collection"
DEFAULT_BATCH_SIZE = 100
DEFAULT_METADATA_KEY = "metadata"


class TypesenseVectorStore(VectorStore):
    """Typesense Vector Store.

    In this vector store, embeddings and docs are stored within a
    Typesense collection.

    During query time, the index uses Typesense to query for the top
    k most similar nodes. Supports both vector search (default) and
    plain text search (``VectorStoreQueryMode.TEXT_SEARCH``).

    Args:
        client (Any): Typesense client.
        tokenizer (Optional[Callable[[str], List]]): tokenizer function.
        text_key (str): document field under which node text is stored.
        collection_name (str): name of the Typesense collection to use.
        batch_size (int): batch size for bulk upserts.
        metadata_key (str): document field under which node metadata is stored.

    """

    stores_text: bool = True
    # Queries may carry either an embedding or a raw query string,
    # so this store does not strictly require an embedding.
    is_embedding_query = False

    def __init__(
        self,
        client: Any,
        tokenizer: Optional[Callable[[str], List]] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        collection_name: str = DEFAULT_COLLECTION_NAME,
        batch_size: int = DEFAULT_BATCH_SIZE,
        metadata_key: str = DEFAULT_METADATA_KEY,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Raises:
            ImportError: if the ``typesense`` package is not installed.
            ValueError: if ``client`` is not a ``typesense.Client`` instance.
        """
        import_err_msg = (
            "`typesense` package not found, please run `pip install typesense`"
        )
        try:
            import typesense  # noqa: F401
        except ImportError:
            raise ImportError(import_err_msg)

        if client is not None:
            if not isinstance(client, typesense.Client):
                raise ValueError(
                    f"client should be an instance of typesense.Client, "
                    f"got {type(client)}"
                )
            self._client = cast(typesense.Client, client)
        self._tokenizer = tokenizer or utils.globals_helper.tokenizer
        self._text_key = text_key
        self._collection_name = collection_name
        # Lazy handle: the collection is only created on first `add` if missing.
        self._collection = self._client.collections[self._collection_name]
        self._batch_size = batch_size
        self._metadata_key = metadata_key

    @property
    def client(self) -> Any:
        """Return Typesense client."""
        return self._client

    @property
    def collection(self) -> Any:
        """Return Typesense collection."""
        return self._collection

    def _create_collection(self, num_dim: int) -> None:
        """Create the backing collection with a vector field of `num_dim` dims."""
        fields = [
            {"name": "vec", "type": "float[]", "num_dim": num_dim},
            {"name": f"{self._text_key}", "type": "string"},
            # Auto-detect types for any other fields (e.g. ref_doc_id, metadata).
            {"name": ".*", "type": "auto"},
        ]
        self._client.collections.create(
            {"name": self._collection_name, "fields": fields}
        )

    def _create_upsert_docs(
        self, embedding_results: List[NodeWithEmbedding]
    ) -> List[dict]:
        """Convert embedding results into Typesense upsert documents."""
        upsert_docs = []
        for node in embedding_results:
            doc = {
                "id": node.id,
                "vec": node.embedding,
                f"{self._text_key}": node.node.text,
                "ref_doc_id": node.ref_doc_id,
                f"{self._metadata_key}": node_to_metadata_dict(node.node),
            }
            upsert_docs.append(doc)

        return upsert_docs

    @staticmethod
    def _to_typesense_filter(standard_filters: MetadataFilters) -> str:
        """Convert from standard dataclass to a typesense `filter_by` string.

        Only a filter with key ``"filter_by"`` is honored; its value is passed
        through verbatim as the Typesense filter expression.
        """
        for filter in standard_filters.filters:
            if filter.key == "filter_by":
                return str(filter.value)

        return ""

    def add(
        self,
        embedding_results: List[NodeWithEmbedding],
    ) -> List[str]:
        """Add embedding results to index.

        Creates the backing collection on first use if it does not exist,
        inferring the vector dimensionality from the first embedding.

        Args:
            embedding_results: List[NodeWithEmbedding]: list of embedding results

        Returns:
            List of ids of the added nodes.
        """
        from typesense.collection import Collection
        from typesense.exceptions import ObjectNotFound

        # Guard against IndexError below when inferring num_dim from an
        # empty batch.
        if not embedding_results:
            return []

        docs = self._create_upsert_docs(embedding_results)

        collection = cast(Collection, self.collection)
        try:
            collection.documents.import_(
                docs, {"action": "upsert"}, batch_size=self._batch_size
            )
        except ObjectNotFound:
            # Create the collection if it doesn't already exist, then retry.
            num_dim = len(embedding_results[0].embedding)
            self._create_collection(num_dim)
            collection.documents.import_(
                docs, {"action": "upsert"}, batch_size=self._batch_size
            )

        return [result.id for result in embedding_results]

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """Delete nodes matching ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        Raises:
            ImportError: if the ``typesense`` package is not installed.
        """
        try:
            from typesense.collection import Collection

            collection = cast(Collection, self.collection)
        except ImportError:
            raise ImportError("Typesense not found. Please run `pip install typesense`")

        collection.documents.delete({"filter_by": f"ref_doc_id:={ref_doc_id}"})

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query Typesense index for top k most similar nodes.

        Vector search is used unless ``query.mode`` is
        ``VectorStoreQueryMode.TEXT_SEARCH``, in which case a plain text
        search on the text field is performed (no similarity scores).

        Args:
            query (VectorStoreQuery): Vector store query object.

        Raises:
            ValueError: if the required query embedding / query string
                for the selected mode is missing.
        """
        if query.filters:
            typesense_filter = self._to_typesense_filter(query.filters)
        else:
            typesense_filter = ""

        text_search = query.mode is VectorStoreQueryMode.TEXT_SEARCH
        if text_search:
            if not query.query_str:
                raise ValueError("Text search requires a query string")
            search_requests = {
                "searches": [
                    {
                        "collection": self._collection_name,
                        "q": query.query_str,
                        "query_by": self._text_key,
                        "filter_by": typesense_filter,
                    }
                ]
            }
        else:
            if not query.query_embedding:
                raise ValueError("Vector search requires a query embedding")
            embedded_query = [str(x) for x in query.query_embedding]
            search_requests = {
                "searches": [
                    {
                        "collection": self._collection_name,
                        "q": "*",
                        "vector_query": f'vec:([{",".join(embedded_query)}],'
                        + f"k:{query.similarity_top_k})",
                        "filter_by": typesense_filter,
                    }
                ]
            }
        response = self._client.multi_search.perform(search_requests, {})

        top_k_nodes = []
        top_k_ids = []
        # Text search has no vector distances, so similarities stay None.
        top_k_scores: Optional[List[float]] = None if text_search else []

        for hit in response["results"][0]["hits"]:
            document = hit["document"]
            doc_id = document["id"]
            text = document[self._text_key]
            extra_info, node_info, relationships = metadata_dict_to_node(
                document[self._metadata_key], text_key=self._text_key
            )

            node = Node(
                text=text,
                doc_id=doc_id,
                extra_info=extra_info,
                node_info=node_info,
                relationships=relationships,
            )
            top_k_ids.append(doc_id)
            top_k_nodes.append(node)
            if top_k_scores is not None:
                # Note that typesense distances range from 0 to 2,
                # where 0 is most similar and 2 is most dissimilar
                top_k_scores.append(hit["vector_distance"])

        return VectorStoreQueryResult(
            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
        )