Skip to content
Snippets Groups Projects
Unverified Commit ca60ba82 authored by Hakeem Abbas's avatar Hakeem Abbas Committed by GitHub
Browse files

fix: sanitizeNamespace (#3246)


bug fixes for sanitizing Namespaces and handling chunk size limit of astradb collections in each doc

Co-authored-by: default avatarTimothy Carambat <rambat1010@gmail.com>
parent e49ec05a
No related branches found
No related tags found
No related merge requests found
......@@ -6,6 +6,27 @@ const { v4: uuidv4 } = require("uuid");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const sanitizeNamespace = (namespace) => {
// If namespace already starts with ns_, don't add it again
if (namespace.startsWith("ns_")) return namespace;
// Remove any invalid characters, ensure starts with letter
return `ns_${namespace.replace(/[^a-zA-Z0-9_]/g, "_")}`;
};
// Add this helper method to check if collection exists more reliably
const collectionExists = async function (client, namespace) {
try {
const collections = await AstraDB.allNamespaces(client);
if (collections) {
return collections.includes(namespace);
}
} catch (error) {
console.log("Astra::collectionExists check error", error?.message || error);
return false; // Return false for any error to allow creation attempt
}
};
const AstraDB = {
name: "AstraDB",
connect: async function () {
......@@ -49,7 +70,10 @@ const AstraDB = {
},
namespace: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.collection(namespace).catch(() => null);
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client
.collection(sanitizedNamespace)
.catch(() => null);
if (!(await this.isRealCollection(collection))) return null;
const count = await collection.countDocuments().catch((e) => {
......@@ -70,32 +94,50 @@ const AstraDB = {
},
namespaceExists: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.collection(namespace);
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client.collection(sanitizedNamespace);
return await this.isRealCollection(collection);
},
deleteVectorsInNamespace: async function (client, namespace = null) {
await client.dropCollection(namespace);
const sanitizedNamespace = sanitizeNamespace(namespace);
await client.dropCollection(sanitizedNamespace);
return true;
},
// AstraDB requires a dimension aspect for collection creation
// we pass this in from the first chunk to infer the dimensions like other
// providers do.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
const isExists = await this.namespaceExists(client, namespace);
if (!isExists) {
if (!dimensions)
throw new Error(
`AstraDB:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
);
const sanitizedNamespace = sanitizeNamespace(namespace);
try {
const exists = await collectionExists(client, sanitizedNamespace);
await client.createCollection(namespace, {
vector: {
dimension: dimensions,
metric: "cosine",
},
});
if (!exists) {
if (!dimensions) {
throw new Error(
`AstraDB:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on Github for support.`
);
}
// Create new collection
await client.createCollection(sanitizedNamespace, {
vector: {
dimension: dimensions,
metric: "cosine",
},
});
// Get the newly created collection
return await client.collection(sanitizedNamespace);
}
return await client.collection(sanitizedNamespace);
} catch (error) {
console.error(
"Astra::getOrCreateCollection error",
error?.message || error
);
throw error;
}
return await client.collection(namespace);
},
addDocumentToNamespace: async function (
namespace,
......@@ -150,11 +192,14 @@ const AstraDB = {
const EmbedderEngine = getEmbeddingEngineSelection();
const textSplitter = new TextSplitter({
chunkSize: TextSplitter.determineMaxChunkSize(
await SystemSettings.getValueOrFallback({
label: "text_splitter_chunk_size",
}),
EmbedderEngine?.embeddingMaxChunkLength
chunkSize: Math.min(
7500,
TextSplitter.determineMaxChunkSize(
await SystemSettings.getValueOrFallback({
label: "text_splitter_chunk_size",
}),
EmbedderEngine?.embeddingMaxChunkLength
)
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
......@@ -227,6 +272,7 @@ const AstraDB = {
deleteDocumentFromNamespace: async function (namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
namespace = sanitizeNamespace(namespace);
if (!(await this.namespaceExists(client, namespace)))
throw new Error(
"Invalid namespace - has it been collected and populated yet?"
......@@ -259,7 +305,10 @@ const AstraDB = {
throw new Error("Invalid request to performSimilaritySearch.");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
// Sanitize namespace before checking existence
const sanitizedNamespace = sanitizeNamespace(namespace);
if (!(await this.namespaceExists(client, sanitizedNamespace))) {
return {
contextTexts: [],
sources: [],
......@@ -271,7 +320,7 @@ const AstraDB = {
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse({
client,
namespace,
namespace: sanitizedNamespace,
queryVector,
similarityThreshold,
topN,
......@@ -300,8 +349,9 @@ const AstraDB = {
sourceDocuments: [],
scores: [],
};
const collection = await client.collection(namespace);
// Namespace should already be sanitized, but let's be defensive
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client.collection(sanitizedNamespace);
const responses = await collection
.find(
{},
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment