From 95602c795964190a3c17b9ab5208e71852da4ef5 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Tue, 23 Apr 2024 11:56:37 +0700 Subject: [PATCH] feat: override generate hash function for image document (#751) Co-authored-by: Marcus Schiesser <mail@marcusschiesser.de> --- examples/multimodal/load.ts | 2 -- packages/core/src/Node.ts | 31 ++++++++++++++++++++++++++++ packages/core/tests/Document.test.ts | 14 ++++++++++++- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git a/examples/multimodal/load.ts b/examples/multimodal/load.ts index 15c845b8f..3ed94e30b 100644 --- a/examples/multimodal/load.ts +++ b/examples/multimodal/load.ts @@ -4,7 +4,6 @@ import { VectorStoreIndex, storageContextFromDefaults, } from "llamaindex"; -import { DocStoreStrategy } from "llamaindex/ingestion/strategies/index"; import * as path from "path"; @@ -32,7 +31,6 @@ async function generateDatasource() { }); await VectorStoreIndex.fromDocuments(documents, { storageContext, - docStoreStrategy: DocStoreStrategy.NONE, }); }); console.log(`Storage successfully generated in ${ms / 1000}s.`); diff --git a/packages/core/src/Node.ts b/packages/core/src/Node.ts index bb7e43c69..8d8d045ab 100644 --- a/packages/core/src/Node.ts +++ b/packages/core/src/Node.ts @@ -326,6 +326,37 @@ export class ImageNode<T extends Metadata = Metadata> extends TextNode<T> { const absPath = path.resolve(this.id_); return new URL(`file://${absPath}`); } + + // Calculates the image part of the hash + private generateImageHash() { + const hashFunction = createSHA256(); + + if (this.image instanceof Blob) { + // TODO: ideally we should use the blob's content to calculate the hash: + // hashFunction.update(new Uint8Array(await this.image.arrayBuffer())); + // as this is async, we're using the node's ID for the time being + hashFunction.update(this.id_); + } else if (this.image instanceof URL) { + hashFunction.update(this.image.toString()); + } else if (typeof this.image === 
"string") { + hashFunction.update(this.image); + } else { + throw new Error( + `Unknown image type: ${typeof this.image}. Can't calculate hash`, + ); + } + + return hashFunction.digest(); + } + + generateHash() { + const hashFunction = createSHA256(); + // calculates hash based on hash of both components (image and text) + hashFunction.update(super.generateHash()); + hashFunction.update(this.generateImageHash()); + + return hashFunction.digest(); + } } export class ImageDocument<T extends Metadata = Metadata> extends ImageNode<T> { diff --git a/packages/core/tests/Document.test.ts b/packages/core/tests/Document.test.ts index 73da731df..d4a6704a9 100644 --- a/packages/core/tests/Document.test.ts +++ b/packages/core/tests/Document.test.ts @@ -1,4 +1,4 @@ -import { Document } from "llamaindex/Node"; +import { Document, ImageDocument } from "llamaindex/Node"; import { describe, expect, test } from "vitest"; describe("Document", () => { @@ -6,4 +6,16 @@ describe("Document", () => { const doc = new Document({ text: "text", id_: "docId" }); expect(doc).toBeDefined(); }); + + test("should generate different hash for different image contents", () => { + const imageNode1 = new ImageDocument({ + id_: "image", + image: "data:image/png;base64,sample_image_content1", + }); + const imageNode2 = new ImageDocument({ + id_: "image", + image: "data:image/png;base64,sample_image_content2", + }); + expect(imageNode1.hash).not.toBe(imageNode2.hash); + }); }); -- GitLab