diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js
index 43cc63767c41120048e87b395ff81ce1915b855c..745fd06153df18778d7fb960a96e56991b370acc 100644
--- a/server/utils/helpers/index.js
+++ b/server/utils/helpers/index.js
@@ -26,14 +26,24 @@ function curateSources(sources = []) {
   const knownDocs = [];
   const documents = [];
 
+  // Sometimes the source may or may not have a metadata property
+  // in the response so we search for it explicitly or just spread the entire
+  // source and check to see if at least title exists.
   for (const source of sources) {
-    const { metadata = {} } = source;
-    if (
-      Object.keys(metadata).length > 0 &&
-      !knownDocs.includes(metadata.title)
-    ) {
-      documents.push({ ...metadata });
-      knownDocs.push(metadata.title);
+    if (source.hasOwnProperty("metadata")) {
+      const { metadata = {} } = source;
+      if (
+        Object.keys(metadata).length > 0 &&
+        !knownDocs.includes(metadata.title)
+      ) {
+        documents.push({ ...metadata });
+        knownDocs.push(metadata.title);
+      }
+    } else {
+      if (Object.keys(source).length > 0 && !knownDocs.includes(source.title)) {
+        documents.push({ ...source });
+        knownDocs.push(source.title);
+      }
     }
   }
 
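Note on the helpers change: the sketch below exercises the reshaped `curateSources()` against both source shapes it now accepts. The function body mirrors the `+` lines above; the trailing `return documents;` and the sample titles are assumptions for illustration, not part of this diff.

```js
// Sketch: the reshaped curateSources() from the hunk above, run against both
// source shapes. `return documents;` and the sample titles are assumed here.
function curateSources(sources = []) {
  const knownDocs = [];
  const documents = [];
  for (const source of sources) {
    if (source.hasOwnProperty("metadata")) {
      // Nested shape: the document fields live under `metadata`.
      const { metadata = {} } = source;
      if (Object.keys(metadata).length > 0 && !knownDocs.includes(metadata.title)) {
        documents.push({ ...metadata });
        knownDocs.push(metadata.title);
      }
    } else {
      // Flat shape: the source itself carries the fields; `title` dedupes.
      if (Object.keys(source).length > 0 && !knownDocs.includes(source.title)) {
        documents.push({ ...source });
        knownDocs.push(source.title);
      }
    }
  }
  return documents;
}

console.log(
  curateSources([
    { metadata: { title: "report.pdf", text: "..." } }, // nested, kept
    { title: "report.pdf", text: "..." },               // flat duplicate, skipped
    { title: "notes.txt", text: "..." },                // flat, kept
  ])
); // → two documents: report.pdf and notes.txt
```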
diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js
index ddeb3afe7613ae06870ea76c7d13149db09cbc31..863b6f1304b4a8a6335459b5c64386e7e8f95c5a 100644
--- a/server/utils/vectorDbProviders/chroma/index.js
+++ b/server/utils/vectorDbProviders/chroma/index.js
@@ -80,15 +80,20 @@ const Chroma = {
       temperature,
     });
   },
-  embedChunk: async function (openai, textChunk) {
+  embedTextInput: async function (openai, textInput) {
+    const result = await this.embedChunks(openai, textInput);
+    return result?.[0] || [];
+  },
+  embedChunks: async function (openai, chunks = []) {
     const {
       data: { data },
     } = await openai.createEmbedding({
       model: "text-embedding-ada-002",
-      input: textChunk,
+      input: chunks,
     });
-    return data.length > 0 && data[0].hasOwnProperty("embedding")
-      ? data[0].embedding
+    return data.length > 0 &&
+      data.every((embd) => embd.hasOwnProperty("embedding"))
+      ? data.map((embd) => embd.embedding)
       : null;
   },
   similarityResponse: async function (client, namespace, queryVector) {
@@ -205,7 +210,7 @@ const Chroma = {
     const documentVectors = [];
     const vectors = [];
     const openai = this.openai();
-
+    const vectorValues = await this.embedChunks(openai, textChunks);
     const submission = {
       ids: [],
       embeddings: [],
@@ -213,31 +218,29 @@
       metadatas: [],
       documents: [],
     };
 
-    for (const textChunk of textChunks) {
-      const vectorValues = await this.embedChunk(openai, textChunk);
-
-      if (!!vectorValues) {
+    if (!!vectorValues && vectorValues.length > 0) {
+      for (const [i, vector] of vectorValues.entries()) {
         const vectorRecord = {
           id: uuidv4(),
-          values: vectorValues,
+          values: vector,
           // [DO NOT REMOVE]
           // LangChain will be unable to find your text if you embed manually and dont include the `text` key.
           // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
-          metadata: { ...metadata, text: textChunk },
+          metadata: { ...metadata, text: textChunks[i] },
         };
         submission.ids.push(vectorRecord.id);
         submission.embeddings.push(vectorRecord.values);
         submission.metadatas.push(metadata);
-        submission.documents.push(textChunk);
+        submission.documents.push(textChunks[i]);
         vectors.push(vectorRecord);
         documentVectors.push({ docId, vectorId: vectorRecord.id });
-      } else {
-        console.error(
-          "Could not use OpenAI to embed document chunk! This document will not be recorded."
-        );
       }
+    } else {
+      console.error(
+        "Could not use OpenAI to embed document chunks! This document will not be recorded."
+      );
     }
 
     const { client } = await this.connect();
@@ -340,7 +343,7 @@ const Chroma = {
       };
     }
 
-    const queryVector = await this.embedChunk(this.openai(), input);
+    const queryVector = await this.embedTextInput(this.openai(), input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       client,
       namespace,
diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js
index 21d962ffac2914f5df252f48df35ec0817be9225..a929d538bb6dc742094bf81c2a18d56ace192668 100644
--- a/server/utils/vectorDbProviders/lance/index.js
+++ b/server/utils/vectorDbProviders/lance/index.js
@@ -51,25 +51,30 @@ const LanceDb = {
       process.env.OPEN_AI_KEY
     );
   },
-  embedder: function () {
-    return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY });
-  },
-  openai: function () {
-    const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY });
-    const openai = new OpenAIApi(config);
-    return openai;
+  embedTextInput: async function (openai, textInput) {
+    const result = await this.embedChunks(openai, textInput);
+    return result?.[0] || [];
   },
-  embedChunk: async function (openai, textChunk) {
+  embedChunks: async function (openai, chunks = []) {
     const {
       data: { data },
     } = await openai.createEmbedding({
       model: "text-embedding-ada-002",
-      input: textChunk,
+      input: chunks,
     });
-    return data.length > 0 && data[0].hasOwnProperty("embedding")
-      ? data[0].embedding
+    return data.length > 0 &&
+      data.every((embd) => embd.hasOwnProperty("embedding"))
+      ? data.map((embd) => embd.embedding)
       : null;
   },
+  embedder: function () {
+    return new OpenAIEmbeddings({ openAIApiKey: process.env.OPEN_AI_KEY });
+  },
+  openai: function () {
+    const config = new Configuration({ apiKey: process.env.OPEN_AI_KEY });
+    const openai = new OpenAIApi(config);
+    return openai;
+  },
   getChatCompletion: async function (
     openai,
     messages = [],
@@ -194,18 +199,17 @@ const LanceDb = {
     const vectors = [];
     const submissions = [];
     const openai = this.openai();
+    const vectorValues = await this.embedChunks(openai, textChunks);
 
-    for (const textChunk of textChunks) {
-      const vectorValues = await this.embedChunk(openai, textChunk);
-
-      if (!!vectorValues) {
+    if (!!vectorValues && vectorValues.length > 0) {
+      for (const [i, vector] of vectorValues.entries()) {
         const vectorRecord = {
           id: uuidv4(),
-          values: vectorValues,
+          values: vector,
           // [DO NOT REMOVE]
           // LangChain will be unable to find your text if you embed manually and dont include the `text` key.
           // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
-          metadata: { ...metadata, text: textChunk },
+          metadata: { ...metadata, text: textChunks[i] },
         };
 
         vectors.push(vectorRecord);
@@ -215,11 +219,11 @@
         submissions.push({
           id: vectorRecord.id,
           vector: vectorRecord.values,
           ...vectorRecord.metadata,
         });
         documentVectors.push({ docId, vectorId: vectorRecord.id });
-      } else {
-        console.error(
-          "Could not use OpenAI to embed document chunk! This document will not be recorded."
-        );
       }
+    } else {
+      console.error(
+        "Could not use OpenAI to embed document chunks! This document will not be recorded."
+      );
     }
 
     if (vectors.length > 0) {
@@ -253,7 +257,7 @@ const LanceDb = {
     }
 
     // LanceDB does not have langchainJS support so we roll our own here.
-    const queryVector = await this.embedChunk(this.openai(), input);
+    const queryVector = await this.embedTextInput(this.openai(), input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       client,
       namespace,
@@ -302,7 +306,7 @@ const LanceDb = {
       };
     }
 
-    const queryVector = await this.embedChunk(this.openai(), input);
+    const queryVector = await this.embedTextInput(this.openai(), input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       client,
       namespace,
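Note on the embedding refactor (identical in the Chroma and LanceDB providers above, and in Pinecone below): instead of one `createEmbedding` request per chunk, every chunk now goes up in a single batched request, and the response's `data` array comes back with one entry per input in input order, which is what makes the `textChunks[i]` pairing in the loops valid. Below is a minimal standalone sketch of that batched call, using the same openai-node v3 `Configuration`/`OpenAIApi` setup these files already use; the sample chunks and the `demo` wrapper are made up for illustration.

```js
// A standalone sketch of the batched embedding call these providers now share.
const { Configuration, OpenAIApi } = require("openai");

const openai = new OpenAIApi(
  new Configuration({ apiKey: process.env.OPEN_AI_KEY })
);

async function demo() {
  const textChunks = ["first chunk of some document", "second chunk"];

  // One round trip embeds every chunk. `data` holds one { index, embedding }
  // entry per input, in the same order as `textChunks`.
  const {
    data: { data },
  } = await openai.createEmbedding({
    model: "text-embedding-ada-002",
    input: textChunks,
  });

  // Same guard as the diff: only accept the batch if every entry embedded.
  const vectorValues =
    data.length > 0 && data.every((embd) => embd.hasOwnProperty("embedding"))
      ? data.map((embd) => embd.embedding)
      : null;

  // vectorValues[i] lines up with textChunks[i], so metadata can pair them.
  console.log(vectorValues?.length); // 2
}

demo();
```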
diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js
index 67e4d1efb253d74705ea51a44ae7d6aa429e0a54..0c03e75bbb4ab6ebdaace1f4de193e560b0cf559 100644
--- a/server/utils/vectorDbProviders/pinecone/index.js
+++ b/server/utils/vectorDbProviders/pinecone/index.js
@@ -54,15 +54,20 @@ const Pinecone = {
     if (!data.hasOwnProperty("choices")) return null;
     return data.choices[0].message.content;
   },
-  embedChunk: async function (openai, textChunk) {
+  embedTextInput: async function (openai, textInput) {
+    const result = await this.embedChunks(openai, textInput);
+    return result?.[0] || [];
+  },
+  embedChunks: async function (openai, chunks = []) {
     const {
       data: { data },
     } = await openai.createEmbedding({
       model: "text-embedding-ada-002",
-      input: textChunk,
+      input: chunks,
    });
-    return data.length > 0 && data[0].hasOwnProperty("embedding")
-      ? data[0].embedding
+    return data.length > 0 &&
+      data.every((embd) => embd.hasOwnProperty("embedding"))
+      ? data.map((embd) => embd.embedding)
       : null;
   },
   llm: function ({ temperature = 0.7 }) {
@@ -175,25 +180,26 @@ const Pinecone = {
     const documentVectors = [];
     const vectors = [];
     const openai = this.openai();
-    for (const textChunk of textChunks) {
-      const vectorValues = await this.embedChunk(openai, textChunk);
+    const vectorValues = await this.embedChunks(openai, textChunks);
 
-      if (!!vectorValues) {
+    if (!!vectorValues && vectorValues.length > 0) {
+      for (const [i, vector] of vectorValues.entries()) {
         const vectorRecord = {
           id: uuidv4(),
-          values: vectorValues,
+          values: vector,
           // [DO NOT REMOVE]
           // LangChain will be unable to find your text if you embed manually and dont include the `text` key.
           // https://github.com/hwchase17/langchainjs/blob/2def486af734c0ca87285a48f1a04c057ab74bdf/langchain/src/vectorstores/pinecone.ts#L64
-          metadata: { ...metadata, text: textChunk },
+          metadata: { ...metadata, text: textChunks[i] },
         };
+
         vectors.push(vectorRecord);
         documentVectors.push({ docId, vectorId: vectorRecord.id });
-      } else {
-        console.error(
-          "Could not use OpenAI to embed document chunk! This document will not be recorded."
-        );
       }
+    } else {
+      console.error(
+        "Could not use OpenAI to embed document chunks! This document will not be recorded."
+      );
     }
 
     if (vectors.length > 0) {
@@ -311,7 +317,7 @@ const Pinecone = {
       "Invalid namespace - has it been collected and seeded yet?"
     );
 
-    const queryVector = await this.embedChunk(this.openai(), input);
+    const queryVector = await this.embedTextInput(this.openai(), input);
     const { contextTexts, sourceDocuments } = await this.similarityResponse(
       pineconeIndex,
       namespace,
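Note on the query path: `embedTextInput` funnels a single string through the same batch helper, since the embeddings endpoint also accepts a bare string and returns one entry. A runnable sketch of its contract follows, with `embedChunks` stubbed out; the stub and vector values are hypothetical, and the real helper calls `openai.createEmbedding` as shown above.

```js
// Sketch of embedTextInput's contract, with embedChunks stubbed so it runs
// standalone. The stub and the sample vector [0.1, 0.2] are made up.
const provider = {
  embedChunks: async function (_openai, chunks) {
    // Stub: pretend the endpoint returned one embedding per input.
    return Array.isArray(chunks) ? chunks.map(() => [0.1, 0.2]) : [[0.1, 0.2]];
  },
  embedTextInput: async function (openai, textInput) {
    const result = await this.embedChunks(openai, textInput);
    return result?.[0] || []; // a single vector, or [] when embedding failed
  },
};

(async () => {
  const queryVector = await provider.embedTextInput(null, "user query text");
  console.log(queryVector); // [0.1, 0.2] — ready for similarityResponse()
})();
```

One behavioral difference worth flagging in review: the old `embedChunk` resolved to `null` on failure, while `embedTextInput` resolves to `[]`, which is truthy in JavaScript, so any downstream `!!queryVector`-style guard that used to catch a failed embedding would now pass.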