From 5f6782038a538b9fad3ea156c4523d4adebc00c3 Mon Sep 17 00:00:00 2001 From: Marcus Schiesser <mail@marcusschiesser.de> Date: Wed, 9 Oct 2024 11:56:02 +0700 Subject: [PATCH] Fix that node parsers generate nodes with UUIDs (#1311) --- .changeset/healthy-tables-hug.md | 6 +++ packages/core/src/schema/node.ts | 4 +- packages/core/src/utils/index.ts | 1 - packages/core/src/utils/uuid.ts | 22 ----------- packages/core/tests/utils/uuid.test.ts | 37 ------------------- .../src/vector-store/QdrantVectorStore.ts | 3 +- .../tests/indices/VectorStoreIndex.test.ts | 4 +- 7 files changed, 11 insertions(+), 66 deletions(-) create mode 100644 .changeset/healthy-tables-hug.md delete mode 100644 packages/core/src/utils/uuid.ts delete mode 100644 packages/core/tests/utils/uuid.test.ts diff --git a/.changeset/healthy-tables-hug.md b/.changeset/healthy-tables-hug.md new file mode 100644 index 000000000..d44406786 --- /dev/null +++ b/.changeset/healthy-tables-hug.md @@ -0,0 +1,6 @@ +--- +"llamaindex": patch +"@llamaindex/core": patch +--- + +Fix that node parsers generate nodes with UUIDs diff --git a/packages/core/src/schema/node.ts b/packages/core/src/schema/node.ts index a79226db1..459dbfb66 100644 --- a/packages/core/src/schema/node.ts +++ b/packages/core/src/schema/node.ts @@ -479,7 +479,7 @@ export function buildNodeFromSplits( ) { const imageDoc = doc as ImageNode; const imageNode = new ImageNode({ - id_: imageDoc.id_ ?? idGenerator(i, imageDoc), + id_: idGenerator(i, imageDoc), text: textChunk, image: imageDoc.image, embedding: imageDoc.embedding, @@ -496,7 +496,7 @@ export function buildNodeFromSplits( ) { const textDoc = doc as TextNode; const node = new TextNode({ - id_: textDoc.id_ ?? idGenerator(i, textDoc), + id_: idGenerator(i, textDoc), text: textChunk, embedding: textDoc.embedding, excludedEmbedMetadataKeys: [...textDoc.excludedEmbedMetadataKeys], diff --git a/packages/core/src/utils/index.ts b/packages/core/src/utils/index.ts index 4c29807ce..d040f010a 100644 --- a/packages/core/src/utils/index.ts +++ b/packages/core/src/utils/index.ts @@ -80,4 +80,3 @@ export { } from "./llms"; export { objectEntries } from "./object-entries"; -export { UUIDFromString } from "./uuid"; diff --git a/packages/core/src/utils/uuid.ts b/packages/core/src/utils/uuid.ts deleted file mode 100644 index a01fb7a60..000000000 --- a/packages/core/src/utils/uuid.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { createSHA256 } from "@llamaindex/env"; - -export function UUIDFromString(input: string) { - const hashFunction = createSHA256(); - hashFunction.update(input); - const base64Hash = hashFunction.digest(); - - // Convert base64 to hex - const hexHash = Buffer.from(base64Hash, "base64").toString("hex"); - - // Format the hash to resemble a UUID (version 5 style) - const uuid = [ - hexHash.substring(0, 8), - hexHash.substring(8, 12), - "5" + hexHash.substring(12, 15), // Set the version to 5 (name-based) - ((parseInt(hexHash.substring(15, 17), 16) & 0x3f) | 0x80).toString(16) + - hexHash.substring(17, 19), // Set the variant - hexHash.substring(19, 31), - ].join("-"); - - return uuid; -} diff --git a/packages/core/tests/utils/uuid.test.ts b/packages/core/tests/utils/uuid.test.ts deleted file mode 100644 index dc1d6055f..000000000 --- a/packages/core/tests/utils/uuid.test.ts +++ /dev/null @@ -1,37 +0,0 @@ -import { UUIDFromString } from "@llamaindex/core/utils"; -import { describe, expect, it } from "vitest"; - -const UUID_REGEX = - /^[0-9a-f]{8}-[0-9a-f]{4}-5[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i; - -describe("UUIDFromString", () => { - it("should convert string to UUID", () => { - const string = "document_id_1"; - const result = UUIDFromString(string); - expect(result).toBeDefined(); - expect(result).toMatch(UUID_REGEX); - }); - - it("should return the same UUID for the same input string", () => { - const string = "document_id_1"; - const result1 = UUIDFromString(string); - const result2 = UUIDFromString(string); - expect(result1).toEqual(result2); - }); - - it("should return the different UUID for different input strings", () => { - const string1 = "document_id_1"; - const string2 = "document_id_2"; - const result1 = UUIDFromString(string1); - const result2 = UUIDFromString(string2); - expect(result1).not.toEqual(result2); - }); - - it("should handle case-sensitive input strings", () => { - const string1 = "document_id_1"; - const string2 = "Document_Id_1"; - const result1 = UUIDFromString(string1); - const result2 = UUIDFromString(string2); - expect(result1).not.toEqual(result2); - }); -}); diff --git a/packages/llamaindex/src/vector-store/QdrantVectorStore.ts b/packages/llamaindex/src/vector-store/QdrantVectorStore.ts index 2fbb05064..4204ae619 100644 --- a/packages/llamaindex/src/vector-store/QdrantVectorStore.ts +++ b/packages/llamaindex/src/vector-store/QdrantVectorStore.ts @@ -10,7 +10,6 @@ import { type VectorStoreQueryResult, } from "./types.js"; -import { UUIDFromString } from "@llamaindex/core/utils"; import type { QdrantClientParams, Schemas } from "@qdrant/js-client-rest"; import { QdrantClient } from "@qdrant/js-client-rest"; import { metadataDictToNode, nodeToMetadata } from "./utils.js"; @@ -171,7 +170,7 @@ export class QdrantVectorStore for (let k = 0; k < nodeIds.length; k++) { const point: PointStruct = { - id: UUIDFromString(nodeIds[k]!.id_), + id: nodeIds[k]!.id_, payload: payloads[k]!, vector: vectors[k]!, }; diff --git a/packages/llamaindex/tests/indices/VectorStoreIndex.test.ts b/packages/llamaindex/tests/indices/VectorStoreIndex.test.ts index a807f8bed..9bca51af7 100644 --- a/packages/llamaindex/tests/indices/VectorStoreIndex.test.ts +++ b/packages/llamaindex/tests/indices/VectorStoreIndex.test.ts @@ -27,7 +27,7 @@ describe("VectorStoreIndex", () => { runs: number = 2, ): Promise<Array<number>> => { const documents = [new Document({ text: "lorem ipsem", id_: "1" })]; - const entries: number[] = []; + const entries = []; for (let i = 0; i < runs; i++) { await VectorStoreIndex.fromDocuments(documents, { serviceContext, @@ -43,7 +43,7 @@ describe("VectorStoreIndex", () => { test("fromDocuments stores duplicates without a doc store strategy", async () => { const entries = await testStrategy(DocStoreStrategy.NONE); - expect(entries[0]).toBe(entries[1]); + expect(entries[0]! + 1).toBe(entries[1]); }); test("fromDocuments ignores duplicates with upserts doc store strategy", async () => { -- GitLab