diff --git a/examples/astradb/README.md b/examples/astradb/README.md index 5dff7adbc3b19be0bf468650832d3c7b6bf310c6..78948903703791615928b0f22106b15b3840353f 100644 --- a/examples/astradb/README.md +++ b/examples/astradb/README.md @@ -14,18 +14,27 @@ Here are two sample scripts which work well with the sample data in the Astra Po - `ASTRA_DB_APPLICATION_TOKEN`: The generated app token for your Astra database - `ASTRA_DB_ENDPOINT`: The API endpoint for your Astra database +- `ASTRA_DB_NAMESPACE`: (Optional) The namespace where your collection is stored defaults to `default_keyspace` - `OPENAI_API_KEY`: Your OpenAI key 2. `cd` Into the `examples` directory 3. run `npm i` -## Load the data +## Example load and query + +Loads and queries a simple vectorstore with some documents about Astra DB + +run `ts-node astradb/example` + +## Movie Reviews Example + +### Load the data This sample loads the same dataset of movie reviews as the Astra Portal sample dataset. (Feel free to load the data in your the Astra Data Explorer to compare) run `ts-node astradb/load` -## Use RAG to Query the data +### Use RAG to Query the data Check out your data in the Astra Data Explorer and change the sample query as you see fit. diff --git a/examples/astradb/example.ts b/examples/astradb/example.ts new file mode 100644 index 0000000000000000000000000000000000000000..67b0170bc9728aa222549a6d9d64d2d41a634ec4 --- /dev/null +++ b/examples/astradb/example.ts @@ -0,0 +1,58 @@ +import { + AstraDBVectorStore, + Document, + storageContextFromDefaults, + VectorStoreIndex, +} from "llamaindex"; + +const collectionName = "test_collection"; + +async function main() { + try { + const docs = [ + new Document({ + text: "AstraDB is built on Apache Cassandra", + metadata: { + id: 123, + foo: "bar", + }, + }), + new Document({ + text: "AstraDB is a NoSQL DB", + metadata: { + id: 456, + foo: "baz", + }, + }), + new Document({ + text: "AstraDB supports vector search", + metadata: { + id: 789, + foo: "qux", + }, + }), + ]; + + const astraVS = new AstraDBVectorStore(); + await astraVS.create(collectionName, { + vector: { dimension: 1536, metric: "cosine" }, + }); + await astraVS.connect(collectionName); + + const ctx = await storageContextFromDefaults({ vectorStore: astraVS }); + const index = await VectorStoreIndex.fromDocuments(docs, { + storageContext: ctx, + }); + + const queryEngine = index.asQueryEngine(); + const response = await queryEngine.query({ + query: "Describe AstraDB.", + }); + + console.log(response.toString()); + } catch (e) { + console.error(e); + } +} + +main(); diff --git a/examples/astradb/load.ts b/examples/astradb/load.ts index 6c422fb145920e24563a7af5666285023023e1bc..d1982505b0bdf62bd17caa78cf6994e0fe5e9986 100644 --- a/examples/astradb/load.ts +++ b/examples/astradb/load.ts @@ -10,9 +10,9 @@ const collectionName = "movie_reviews"; async function main() { try { const reader = new PapaCSVReader(false); - const docs = await reader.loadData("../data/movie_reviews.csv"); + const docs = await reader.loadData("./data/movie_reviews.csv"); - const astraVS = new AstraDBVectorStore(); + const astraVS = new AstraDBVectorStore({ contentKey: "reviewtext" }); await astraVS.create(collectionName, { vector: { dimension: 1536, metric: "cosine" }, }); diff --git a/examples/astradb/query.ts b/examples/astradb/query.ts index 949cfd1188b7d10fab7276639a8b1f8a3574ad90..23985c0d253b6c1d7f46501d1ffcfd43289443c8 100644 --- a/examples/astradb/query.ts +++ b/examples/astradb/query.ts @@ -8,7 +8,7 @@ const collectionName = "movie_reviews"; async function main() { try { - const astraVS = new AstraDBVectorStore(); + const astraVS = new AstraDBVectorStore({ contentKey: "reviewtext" }); await astraVS.connect(collectionName); const ctx = serviceContextFromDefaults(); @@ -19,7 +19,7 @@ async function main() { const queryEngine = await index.asQueryEngine({ retriever }); const results = await queryEngine.query({ - query: "What is the best reviewed movie?", + query: 'How was "La Sapienza" reviewed?', }); console.log(results.response); diff --git a/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts b/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts index d78943e7293a55a9f9013c9c500cbe1cbbd3c973..ef9b3f02c018d2ea72e67e8f5a502a7edee37ed4 100644 --- a/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts +++ b/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts @@ -1,8 +1,9 @@ import { AstraDB } from "@datastax/astra-db-ts"; import { Collection } from "@datastax/astra-db-ts/dist/collections"; import { CreateCollectionOptions } from "@datastax/astra-db-ts/dist/collections/options"; -import { BaseNode, Document, MetadataMode } from "../../Node"; +import { BaseNode, MetadataMode } from "../../Node"; import { VectorStore, VectorStoreQuery, VectorStoreQueryResult } from "./types"; +import { metadataDictToNode, nodeToMetadata } from "./utils"; const MAX_INSERT_BATCH_SIZE = 20; @@ -12,7 +13,7 @@ export class AstraDBVectorStore implements VectorStore { astraDBClient: AstraDB; idKey: string; - contentKey: string | undefined; // if undefined the entirety of the node aside from the id and embedding will be stored as content + contentKey: string; metadataKey: string; private collection: Collection | undefined; @@ -22,6 +23,7 @@ export class AstraDBVectorStore implements VectorStore { params?: { token: string; endpoint: string; + namespace: string; }; }, ) { @@ -40,11 +42,15 @@ export class AstraDBVectorStore implements VectorStore { if (!endpoint) { throw new Error("Must specify ASTRA_DB_ENDPOINT via env variable."); } - this.astraDBClient = new AstraDB(token, endpoint); + const namespace = + init?.params?.namespace ?? + process.env.ASTRA_DB_NAMESPACE ?? + "default_keyspace"; + this.astraDBClient = new AstraDB(token, endpoint, namespace); } this.idKey = init?.idKey ?? "_id"; - this.contentKey = init?.contentKey; + this.contentKey = init?.contentKey ?? "content"; this.metadataKey = init?.metadataKey ?? "metadata"; } @@ -102,12 +108,20 @@ export class AstraDBVectorStore implements VectorStore { if (!nodes || nodes.length === 0) { return []; } + const dataToInsert = nodes.map((node) => { + const metadata = nodeToMetadata( + node, + true, + this.contentKey, + this.flatMetadata, + ); + return { - _id: node.id_, $vector: node.getEmbedding(), - content: node.getContent(MetadataMode.ALL), - metadata: node.metadata, + [this.idKey]: node.id_, + [this.contentKey]: node.getContent(MetadataMode.NONE), + [this.metadataKey]: metadata, }; }); @@ -122,11 +136,10 @@ export class AstraDBVectorStore implements VectorStore { for (const batch of batchData) { console.debug(`Inserting batch of size ${batch.length}`); - - const result = await collection.insertMany(batch); + await collection.insertMany(batch); } - return dataToInsert.map((node) => node._id); + return dataToInsert.map((node) => node?.[this.idKey] as string); } /** @@ -185,27 +198,24 @@ export class AstraDBVectorStore implements VectorStore { const similarities: number[] = []; await cursor.forEach(async (row: Record<string, any>) => { - const id = row[this.idKey]; - const embedding = row.$vector; - const similarity = row.$similarity; - const metadata = row[this.metadataKey]; - - // Remove fields from content - delete row[this.idKey]; - delete row.$similarity; - delete row.$vector; - delete row[this.metadataKey]; - - const content = this.contentKey - ? row[this.contentKey] - : JSON.stringify(row); - - const node = new Document({ - id_: id, - text: content, - metadata: metadata ?? {}, - embedding: embedding, + const { + $vector: embedding, + $similarity: similarity, + [this.idKey]: id, + [this.contentKey]: content, + [this.metadataKey]: metadata = {}, + ...rest + } = row; + + const node = metadataDictToNode(metadata, { + fallback: { + id, + text: content, + metadata, + ...rest, + }, }); + node.setContent(content); ids.push(id); similarities.push(similarity); diff --git a/packages/core/src/storage/vectorStore/utils.ts b/packages/core/src/storage/vectorStore/utils.ts index a20dbf7e69e005b9f0f887ab69eec291da652825..f0c2a512b8935854e6da0b6e9875afa1d0faea6c 100644 --- a/packages/core/src/storage/vectorStore/utils.ts +++ b/packages/core/src/storage/vectorStore/utils.ts @@ -36,7 +36,16 @@ export function nodeToMetadata( return metadata; } -export function metadataDictToNode(metadata: Metadata): BaseNode { +type MetadataDictToNodeOptions = { + // If the metadata doesn't contain node content, use this object as a fallback, for usage see + // AstraDBVectorStore.ts + fallback: Record<string, any>; +}; + +export function metadataDictToNode( + metadata: Metadata, + options?: MetadataDictToNodeOptions, +): BaseNode { const { _node_content: nodeContent, _node_type: nodeType, @@ -45,11 +54,17 @@ export function metadataDictToNode(metadata: Metadata): BaseNode { ref_doc_id, ...rest } = metadata; + let nodeObj; if (!nodeContent) { - throw new Error("Node content not found in metadata."); + if (options?.fallback) { + nodeObj = options?.fallback; + } else { + throw new Error("Node content not found in metadata."); + } + } else { + nodeObj = JSON.parse(nodeContent); + nodeObj.metadata = rest; } - const nodeObj = JSON.parse(nodeContent); - nodeObj.metadata = rest; // Note: we're using the name of the class stored in `_node_type` // and not the type attribute to reconstruct diff --git a/packages/eslint-config-custom/index.js b/packages/eslint-config-custom/index.js index ff53536b5dc7ef9b60c5f91780a5751b84a82c80..db2ba0f50ec343acaaedfeaef71f657606f5485b 100644 --- a/packages/eslint-config-custom/index.js +++ b/packages/eslint-config-custom/index.js @@ -14,6 +14,7 @@ module.exports = { "ASTRA_DB_APPLICATION_TOKEN", "ASTRA_DB_ENDPOINT", + "ASTRA_DB_NAMESPACE", "AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT",