From bd08004afe9fccbf5c8470eb12817a989997d1dc Mon Sep 17 00:00:00 2001
From: Mike Fortman <michael.fortman@datastax.com>
Date: Mon, 5 Feb 2024 22:31:08 -0600
Subject: [PATCH] Update Astra DB Vectorstore to support namespaces (#485)

Co-authored-by: Marcus Schiesser <mail@marcusschiesser.de>
---
 examples/astradb/README.md                    | 13 +++-
 examples/astradb/example.ts                   | 58 +++++++++++++++
 examples/astradb/load.ts                      |  4 +-
 examples/astradb/query.ts                     |  4 +-
 .../storage/vectorStore/AstraDBVectorStore.ts | 70 +++++++++++--------
 .../core/src/storage/vectorStore/utils.ts     | 23 ++++--
 packages/eslint-config-custom/index.js        |  1 +
 7 files changed, 133 insertions(+), 40 deletions(-)
 create mode 100644 examples/astradb/example.ts

diff --git a/examples/astradb/README.md b/examples/astradb/README.md
index 5dff7adbc..789489037 100644
--- a/examples/astradb/README.md
+++ b/examples/astradb/README.md
@@ -14,18 +14,27 @@ Here are two sample scripts which work well with the sample data in the Astra Po
 
 - `ASTRA_DB_APPLICATION_TOKEN`: The generated app token for your Astra database
 - `ASTRA_DB_ENDPOINT`: The API endpoint for your Astra database
+- `ASTRA_DB_NAMESPACE`: (Optional) The namespace where your collection is stored; defaults to `default_keyspace`
 - `OPENAI_API_KEY`: Your OpenAI key
 
 2. `cd` Into the `examples` directory
 3. run `npm i`
 
-## Load the data
+## Example load and query
+
+Loads and queries a simple vector store with some documents about Astra DB.
+
+run `ts-node astradb/example`
+
+## Movie Reviews Example
+
+### Load the data
 
 This sample loads the same dataset of movie reviews as the Astra Portal sample dataset. (Feel free to load the data in your the Astra Data Explorer to compare)
 
 run `ts-node astradb/load`
 
-## Use RAG to Query the data
+### Use RAG to Query the data
 
 Check out your data in the Astra Data Explorer and change the sample query as you see fit.
 
diff --git a/examples/astradb/example.ts b/examples/astradb/example.ts
new file mode 100644
index 000000000..67b0170bc
--- /dev/null
+++ b/examples/astradb/example.ts
@@ -0,0 +1,58 @@
+import {
+  AstraDBVectorStore,
+  Document,
+  storageContextFromDefaults,
+  VectorStoreIndex,
+} from "llamaindex";
+
+const collectionName = "test_collection";
+
+async function main() {
+  try {
+    const docs = [
+      new Document({
+        text: "AstraDB is built on Apache Cassandra",
+        metadata: {
+          id: 123,
+          foo: "bar",
+        },
+      }),
+      new Document({
+        text: "AstraDB is a NoSQL DB",
+        metadata: {
+          id: 456,
+          foo: "baz",
+        },
+      }),
+      new Document({
+        text: "AstraDB supports vector search",
+        metadata: {
+          id: 789,
+          foo: "qux",
+        },
+      }),
+    ];
+
+    const astraVS = new AstraDBVectorStore();
+    await astraVS.create(collectionName, {
+      vector: { dimension: 1536, metric: "cosine" },
+    });
+    await astraVS.connect(collectionName);
+
+    const ctx = await storageContextFromDefaults({ vectorStore: astraVS });
+    const index = await VectorStoreIndex.fromDocuments(docs, {
+      storageContext: ctx,
+    });
+
+    const queryEngine = index.asQueryEngine();
+    const response = await queryEngine.query({
+      query: "Describe AstraDB.",
+    });
+
+    console.log(response.toString());
+  } catch (e) {
+    console.error(e);
+  }
+}
+
+main();
diff --git a/examples/astradb/load.ts b/examples/astradb/load.ts
index 6c422fb14..d1982505b 100644
--- a/examples/astradb/load.ts
+++ b/examples/astradb/load.ts
@@ -10,9 +10,9 @@ const collectionName = "movie_reviews";
 async function main() {
   try {
     const reader = new PapaCSVReader(false);
-    const docs = await reader.loadData("../data/movie_reviews.csv");
+    const docs = await reader.loadData("./data/movie_reviews.csv");
 
-    const astraVS = new AstraDBVectorStore();
+    const astraVS = new AstraDBVectorStore({ contentKey: "reviewtext" });
     await astraVS.create(collectionName, {
       vector: { dimension: 1536, metric: "cosine" },
     });
diff --git a/examples/astradb/query.ts b/examples/astradb/query.ts
index 949cfd118..23985c0d2 100644
--- a/examples/astradb/query.ts
+++ b/examples/astradb/query.ts
@@ -8,7 +8,7 @@ const collectionName = "movie_reviews";
 
 async function main() {
   try {
-    const astraVS = new AstraDBVectorStore();
+    const astraVS = new AstraDBVectorStore({ contentKey: "reviewtext" });
     await astraVS.connect(collectionName);
 
     const ctx = serviceContextFromDefaults();
@@ -19,7 +19,7 @@ async function main() {
     const queryEngine = await index.asQueryEngine({ retriever });
 
     const results = await queryEngine.query({
-      query: "What is the best reviewed movie?",
+      query: 'How was "La Sapienza" reviewed?',
     });
 
     console.log(results.response);
diff --git a/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts b/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts
index d78943e72..ef9b3f02c 100644
--- a/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts
+++ b/packages/core/src/storage/vectorStore/AstraDBVectorStore.ts
@@ -1,8 +1,9 @@
 import { AstraDB } from "@datastax/astra-db-ts";
 import { Collection } from "@datastax/astra-db-ts/dist/collections";
 import { CreateCollectionOptions } from "@datastax/astra-db-ts/dist/collections/options";
-import { BaseNode, Document, MetadataMode } from "../../Node";
+import { BaseNode, MetadataMode } from "../../Node";
 import { VectorStore, VectorStoreQuery, VectorStoreQueryResult } from "./types";
+import { metadataDictToNode, nodeToMetadata } from "./utils";
 
 const MAX_INSERT_BATCH_SIZE = 20;
 
@@ -12,7 +13,7 @@ export class AstraDBVectorStore implements VectorStore {
 
   astraDBClient: AstraDB;
   idKey: string;
-  contentKey: string | undefined; // if undefined the entirety of the node aside from the id and embedding will be stored as content
+  contentKey: string;
   metadataKey: string;
 
   private collection: Collection | undefined;
@@ -22,6 +23,7 @@ export class AstraDBVectorStore implements VectorStore {
       params?: {
         token: string;
         endpoint: string;
+        namespace: string;
       };
     },
   ) {
@@ -40,11 +42,15 @@ export class AstraDBVectorStore implements VectorStore {
       if (!endpoint) {
         throw new Error("Must specify ASTRA_DB_ENDPOINT via env variable.");
       }
-      this.astraDBClient = new AstraDB(token, endpoint);
+      const namespace =
+        init?.params?.namespace ??
+        process.env.ASTRA_DB_NAMESPACE ??
+        "default_keyspace";
+      this.astraDBClient = new AstraDB(token, endpoint, namespace);
     }
 
     this.idKey = init?.idKey ?? "_id";
-    this.contentKey = init?.contentKey;
+    this.contentKey = init?.contentKey ?? "content";
     this.metadataKey = init?.metadataKey ?? "metadata";
   }
 
@@ -102,12 +108,20 @@ export class AstraDBVectorStore implements VectorStore {
     if (!nodes || nodes.length === 0) {
       return [];
     }
+
     const dataToInsert = nodes.map((node) => {
+      const metadata = nodeToMetadata(
+        node,
+        true,
+        this.contentKey,
+        this.flatMetadata,
+      );
+
       return {
-        _id: node.id_,
         $vector: node.getEmbedding(),
-        content: node.getContent(MetadataMode.ALL),
-        metadata: node.metadata,
+        [this.idKey]: node.id_,
+        [this.contentKey]: node.getContent(MetadataMode.NONE),
+        [this.metadataKey]: metadata,
       };
     });
 
@@ -122,11 +136,10 @@ export class AstraDBVectorStore implements VectorStore {
 
     for (const batch of batchData) {
       console.debug(`Inserting batch of size ${batch.length}`);
-
-      const result = await collection.insertMany(batch);
+      await collection.insertMany(batch);
     }
 
-    return dataToInsert.map((node) => node._id);
+    return dataToInsert.map((node) => node?.[this.idKey] as string);
   }
 
   /**
@@ -185,27 +198,24 @@ export class AstraDBVectorStore implements VectorStore {
     const similarities: number[] = [];
 
     await cursor.forEach(async (row: Record<string, any>) => {
-      const id = row[this.idKey];
-      const embedding = row.$vector;
-      const similarity = row.$similarity;
-      const metadata = row[this.metadataKey];
-
-      // Remove fields from content
-      delete row[this.idKey];
-      delete row.$similarity;
-      delete row.$vector;
-      delete row[this.metadataKey];
-
-      const content = this.contentKey
-        ? row[this.contentKey]
-        : JSON.stringify(row);
-
-      const node = new Document({
-        id_: id,
-        text: content,
-        metadata: metadata ?? {},
-        embedding: embedding,
+      const {
+        $vector: embedding,
+        $similarity: similarity,
+        [this.idKey]: id,
+        [this.contentKey]: content,
+        [this.metadataKey]: metadata = {},
+        ...rest
+      } = row;
+
+      const node = metadataDictToNode(metadata, {
+        fallback: {
+          id,
+          text: content,
+          metadata,
+          ...rest,
+        },
       });
+      node.setContent(content);
 
       ids.push(id);
       similarities.push(similarity);
diff --git a/packages/core/src/storage/vectorStore/utils.ts b/packages/core/src/storage/vectorStore/utils.ts
index a20dbf7e6..f0c2a512b 100644
--- a/packages/core/src/storage/vectorStore/utils.ts
+++ b/packages/core/src/storage/vectorStore/utils.ts
@@ -36,7 +36,16 @@ export function nodeToMetadata(
   return metadata;
 }
 
-export function metadataDictToNode(metadata: Metadata): BaseNode {
+type MetadataDictToNodeOptions = {
+  // Object used to build the node when the metadata doesn't contain node content
+  // (`_node_content`). For a usage example, see AstraDBVectorStore.ts.
+  fallback: Record<string, any>;
+};
+
+export function metadataDictToNode(
+  metadata: Metadata,
+  options?: MetadataDictToNodeOptions,
+): BaseNode {
   const {
     _node_content: nodeContent,
     _node_type: nodeType,
@@ -45,11 +54,17 @@ export function metadataDictToNode(metadata: Metadata): BaseNode {
     ref_doc_id,
     ...rest
   } = metadata;
+  let nodeObj;
   if (!nodeContent) {
-    throw new Error("Node content not found in metadata.");
+    if (options?.fallback) {
+      nodeObj = options?.fallback;
+    } else {
+      throw new Error("Node content not found in metadata.");
+    }
+  } else {
+    nodeObj = JSON.parse(nodeContent);
+    nodeObj.metadata = rest;
   }
-  const nodeObj = JSON.parse(nodeContent);
-  nodeObj.metadata = rest;
 
   // Note: we're using the name of the class stored in `_node_type`
   // and not the type attribute to reconstruct
diff --git a/packages/eslint-config-custom/index.js b/packages/eslint-config-custom/index.js
index ff53536b5..db2ba0f50 100644
--- a/packages/eslint-config-custom/index.js
+++ b/packages/eslint-config-custom/index.js
@@ -14,6 +14,7 @@ module.exports = {
 
           "ASTRA_DB_APPLICATION_TOKEN",
           "ASTRA_DB_ENDPOINT",
+          "ASTRA_DB_NAMESPACE",
 
           "AZURE_OPENAI_KEY",
           "AZURE_OPENAI_ENDPOINT",
-- 
GitLab