diff --git a/examples/paul_graham_essay/ColbertIndex.ipynb b/examples/paul_graham_essay/ColbertIndex.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..1970bcd5b7f4b0b895ab466c8b226c5e3ba2704c
--- /dev/null
+++ b/examples/paul_graham_essay/ColbertIndex.ipynb
@@ -0,0 +1,110 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "636497e9-825c-43b1-b2ee-dc483c110a70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "\n",
+    "sys.path.append(\"../..\")  # make it possible to import from the experimental dir"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7dce46c9-2a8e-43f8-ad3f-d410ba1445f7",
+   "metadata": {},
+   "source": [
+    "ColBERT is currently an experimental feature. It is a neural retrieval method that tends to work well in a zero-shot setting on out-of-domain datasets, due to its use of token-level encodings (rather than sentence- or chunk-level encodings)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4a52cb17-6cac-41e0-8d74-1d4ef03ee6e3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index import SimpleDirectoryReader, ServiceContext\n",
+    "from experimental.colbert_index import ColbertIndex\n",
+    "from llama_index.llms import OpenAI"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "10cb46a9-1cd6-4eff-9aee-34861aa43bef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = OpenAI(temperature=0, model=\"gpt-3.5-turbo\")\n",
+    "service_context = ServiceContext.from_defaults(llm=llm, embed_model=\"local\")\n",
+    "\n",
+    "documents = SimpleDirectoryReader(\"data\").load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ffaa8b5-2d82-4544-a54d-efe27fcce538",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index = ColbertIndex.from_documents(\n",
+    "    documents=documents, service_context=service_context\n",
+    ")\n",
+    "query_engine = index.as_query_engine()\n",
+    "response = query_engine.query(\"What did the author do after his time at Y Combinator?\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "5f483a87-e5a0-456f-9b71-1b747a256627",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After his time at Y Combinator, the author moved back to Providence to continue his studies at RISD. 
He then moved to New York City and took up residence in Yorkville, where he began to make paintings and pursue his career as an artist.\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(response.response)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8c02ec04-1f7d-4d60-a9f9-742a32c9ec97",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/experimental/colbert_index/__init__.py b/experimental/colbert_index/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..119f0261ff8eeb8d32456ee016f72c2b35e6939a
--- /dev/null
+++ b/experimental/colbert_index/__init__.py
@@ -0,0 +1,4 @@
+from .base import ColbertIndex
+from .retriever import ColbertRetriever
+
+__all__ = ["ColbertIndex", "ColbertRetriever"]
diff --git a/experimental/colbert_index/base.py b/experimental/colbert_index/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f641113021bb6bf2831f576d99a2d01287405ac
--- /dev/null
+++ b/experimental/colbert_index/base.py
@@ -0,0 +1,158 @@
+from typing import Any, List, Optional, Sequence, Dict
+
+from llama_index.data_structs.data_structs import IndexDict
+from llama_index.indices.base_retriever import BaseRetriever
+from llama_index.indices.service_context import ServiceContext
+from llama_index.storage.storage_context import StorageContext
+from llama_index.indices.base import BaseIndex
+from llama_index.storage.docstore.types import RefDocInfo
+from llama_index.schema import BaseNode, NodeWithScore
+
+# TODO(jon-chuang):
+# 1. Add support for updating index (inserts/deletes)
+# 2. Add proper support for storage (managing/loading from the index files)
+# 3. Normalize scores (not sure what the best practice is here)
+
+
+class ColbertIndex(BaseIndex[IndexDict]):
+    """
+    Store for ColBERT v2 with PLAID indexing.
+
+    ColBERT is a neural retrieval method that tends to work
+    well in a zero-shot setting on out-of-domain datasets, due
+    to its use of token-level encodings (rather than sentence- or
+    chunk-level encodings).
+
+    Parameters:
+
+    index_path: directory containing PLAID index files
+        (currently fixed to "storage/colbert_index").
+    model_name: ColBERT Hugging Face model name.
+        Default: "colbert-ir/colbertv2.0".
+    show_progress: whether to show progress bar when building index.
+        Default: False. No-op for ColBERT for now.
+    nbits: number of bits to quantize the residual vectors. Default: 2.
+    gpus: number of GPUs to use for indexing. Default: 0.
+    ranks: number of ranks to use for indexing. Default: 1.
+    doc_maxlen: max document length. Default: 120.
+    query_maxlen: max query length. Default: 60.
+    kmeans_niters: number of kmeans clustering iterations. Default: 4.
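+
+    A minimal usage sketch (mirroring the example notebook; it assumes a
+    local "data" directory of documents and that ColBERT is installed
+    from https://github.com/stanford-futuredata/ColBERT):
+
+        from llama_index import SimpleDirectoryReader, ServiceContext
+        from experimental.colbert_index import ColbertIndex
+
+        documents = SimpleDirectoryReader("data").load_data()
+        index = ColbertIndex.from_documents(
+            documents=documents,
+            service_context=ServiceContext.from_defaults(embed_model="local"),
+        )
+        nodes = index.query("What did the author do after Y Combinator?", top_k=3)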
+
+    """
+
+    def __init__(
+        self,
+        nodes: Optional[Sequence[BaseNode]] = None,
+        index_struct: Optional[IndexDict] = None,
+        service_context: Optional[ServiceContext] = None,
+        storage_context: Optional[StorageContext] = None,
+        model_name: str = "colbert-ir/colbertv2.0",
+        show_progress: bool = False,
+        nbits: int = 2,
+        gpus: int = 0,
+        ranks: int = 1,
+        doc_maxlen: int = 120,
+        query_maxlen: int = 60,
+        kmeans_niters: int = 4,
+        **kwargs: Any,
+    ) -> None:
+        self.model_name = model_name
+        self.index_path = "storage/colbert_index"
+        self.nbits = nbits
+        self.gpus = gpus
+        self.ranks = ranks
+        self.doc_maxlen = doc_maxlen
+        self.query_maxlen = query_maxlen
+        self.kmeans_niters = kmeans_niters
+        self._docs_pos_to_node_id: Dict[int, str] = {}
+        try:
+            # Fail fast if the optional colbert dependency is missing.
+            import colbert  # noqa: F401
+        except ImportError as exc:
+            raise ImportError(
+                "Please install colbert to use this feature from the repo: "
+                "https://github.com/stanford-futuredata/ColBERT"
+            ) from exc
+        super().__init__(
+            nodes=nodes,
+            index_struct=index_struct,
+            service_context=service_context,
+            storage_context=storage_context,
+            show_progress=show_progress,
+            **kwargs,
+        )
+
+    def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
+        raise NotImplementedError("ColbertIndex does not support insertion yet.")
+
+    def _delete_node(self, node_id: str, **delete_kwargs: Any) -> None:
+        raise NotImplementedError("ColbertIndex does not support deletion yet.")
+
+    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
+        from .retriever import ColbertRetriever
+
+        return ColbertRetriever(index=self, **kwargs)
+
+    @property
+    def ref_doc_info(self) -> Dict[str, RefDocInfo]:
+        raise NotImplementedError("ColbertIndex does not support ref_doc_info.")
+
+    def _build_index_from_nodes(self, nodes: Sequence[BaseNode]) -> IndexDict:
+        """Generate a PLAID index from the ColBERT checkpoint via its Hugging
+        Face model_name.
+        """
+
+        from colbert import Indexer, Searcher, IndexUpdater
+        from colbert.infra import ColBERTConfig, Run, RunConfig
+
+        index_struct = IndexDict()
+
+        # Record each node's position in the ColBERT collection so that
+        # search results can be mapped back to docstore node ids.
+        docs_list = []
+        for i, node in enumerate(nodes):
+            docs_list.append(node.get_content())
+            self._docs_pos_to_node_id[i] = node.node_id
+            index_struct.add_node(node, text_id=str(i))
+
+        with Run().context(
+            RunConfig(index_root=self.index_path, nranks=self.ranks, gpus=self.gpus)
+        ):
+            config = ColBERTConfig(
+                doc_maxlen=self.doc_maxlen,
+                query_maxlen=self.query_maxlen,
+                nbits=self.nbits,
+                kmeans_niters=self.kmeans_niters,
+            )
+            indexer = Indexer(checkpoint=self.model_name, config=config)
+            indexer.index("", collection=docs_list, overwrite=True)
+            self.store = Searcher(
+                index="", collection=docs_list, checkpoint=self.model_name
+            )
+            self.updater = IndexUpdater(
+                config=config, searcher=self.store, checkpoint=self.model_name
+            )
+        return index_struct
+
+    # @staticmethod
+    # def _normalize_scores(docs: List[Document]) -> None:
+    #     "Normalizing the MaxSim scores using softmax."
+    #     Z = sum(math.exp(doc.score) for doc in docs)
+    #     for doc in docs:
+    #         doc.score = math.exp(doc.score) / Z
+
+    def query(self, query_str: str, top_k: int = 10) -> List[NodeWithScore]:
+        """
+        Query the ColBERT v2 + PLAID store.
+
+        Returns: list of NodeWithScore.
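+
+        Note: scores are raw MaxSim scores and are not normalized
+        (see the TODO at the top of this module and the commented-out
+        _normalize_scores helper above).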
+        """
+
+        doc_ids, _, scores = self.store.search(text=query_str, k=top_k)
+
+        # Map ColBERT collection positions back to docstore node ids.
+        node_doc_ids = [self._docs_pos_to_node_id[doc_id] for doc_id in doc_ids]
+        nodes = self.docstore.get_nodes(node_doc_ids)
+
+        return [
+            NodeWithScore(node=node, score=score)
+            for node, score in zip(nodes, scores)
+        ]
diff --git a/experimental/colbert_index/retriever.py b/experimental/colbert_index/retriever.py
new file mode 100644
index 0000000000000000000000000000000000000000..fdba0e363733c09e8dc6330ba55d387bb1df9143
--- /dev/null
+++ b/experimental/colbert_index/retriever.py
@@ -0,0 +1,53 @@
+from typing import Optional, Dict, List, Any
+
+from llama_index.indices.base_retriever import BaseRetriever
+from llama_index.constants import DEFAULT_SIMILARITY_TOP_K
+from .base import ColbertIndex
+from llama_index.schema import NodeWithScore
+from llama_index.indices.query.schema import QueryBundle
+from llama_index.vector_stores.types import MetadataFilters
+
+
+class ColbertRetriever(BaseRetriever):
+    """ColBERT index retriever.
+
+    Args:
+        index (ColbertIndex): ColBERT index.
+        similarity_top_k (int): number of top k results to return.
+        filters (Optional[MetadataFilters]): metadata filters, defaults to None.
+        node_ids (Optional[List[str]]): list of nodes to constrain search.
+        doc_ids (Optional[List[str]]): list of documents to constrain search.
+        colbert_kwargs (dict): Additional ColBERT-specific kwargs to pass
+            through to the ColBERT index at query time.
+
+    Note: filters, node_ids, and doc_ids are currently stored but not yet
+    applied at query time.
+
+    """
+
+    def __init__(
+        self,
+        index: ColbertIndex,
+        similarity_top_k: int = DEFAULT_SIMILARITY_TOP_K,
+        filters: Optional[MetadataFilters] = None,
+        node_ids: Optional[List[str]] = None,
+        doc_ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize params."""
+        self._index = index
+        self._service_context = self._index.service_context
+        self._docstore = self._index.docstore
+
+        self._similarity_top_k = similarity_top_k
+        self._node_ids = node_ids
+        self._doc_ids = doc_ids
+        self._filters = filters
+
+        self._kwargs: Dict[str, Any] = kwargs.get("colbert_kwargs", {})
+
+    def _retrieve(
+        self,
+        query_bundle: QueryBundle,
+    ) -> List[NodeWithScore]:
+        return self._index.query(
+            query_str=query_bundle.query_str,
+            top_k=self._similarity_top_k,
+            **self._kwargs,
+        )
diff --git a/llama_index/indices/base.py b/llama_index/indices/base.py
index cba17284585118e1cfba2ee1ae78939e645cf2a5..737ec462052060a5252791bad3d32c30b9272ab5 100644
--- a/llama_index/indices/base.py
+++ b/llama_index/indices/base.py
@@ -328,7 +328,7 @@ class BaseIndex(Generic[IS], ABC):
 
     @abstractmethod
     def as_retriever(self, **kwargs: Any) -> BaseRetriever:
-        pass
+        ...
 
     def as_query_engine(self, **kwargs: Any) -> BaseQueryEngine:
         # NOTE: lazy import