diff --git a/examples/data/multi_modal/1.jpg b/examples/data/multi_modal/1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..169024b540c591fa85e0d1c24c581dca6f8255b1 Binary files /dev/null and b/examples/data/multi_modal/1.jpg differ diff --git a/examples/data/multi_modal/2.jpg b/examples/data/multi_modal/2.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0a41cb1c510102bf3b8610275716bb7adac44581 Binary files /dev/null and b/examples/data/multi_modal/2.jpg differ diff --git a/examples/data/multi_modal/3.jpg b/examples/data/multi_modal/3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c58d5a1fb48e4e8809804a37263bafa87c9494dc Binary files /dev/null and b/examples/data/multi_modal/3.jpg differ diff --git a/examples/data/multi_modal/60.jpg b/examples/data/multi_modal/60.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5cbdea93a792635df763dd778ec36ce9ae5cfc68 Binary files /dev/null and b/examples/data/multi_modal/60.jpg differ diff --git a/examples/data/multi_modal/61.jpg b/examples/data/multi_modal/61.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d5c9edd53b61cc2b64d53cc96ab2844e34fe4831 Binary files /dev/null and b/examples/data/multi_modal/61.jpg differ diff --git a/examples/data/multi_modal/62.jpg b/examples/data/multi_modal/62.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bbb7b2199c7806ceb51ea382da77157ac7f86dc5 Binary files /dev/null and b/examples/data/multi_modal/62.jpg differ diff --git a/examples/data/multi_modal/San Francisco.txt b/examples/data/multi_modal/San Francisco.txt new file mode 100644 index 0000000000000000000000000000000000000000..938f45d2bc883b2f267b1dfed163db65e8e52f7c Binary files /dev/null and b/examples/data/multi_modal/San Francisco.txt differ diff --git a/examples/data/multi_modal/Vincent van Gogh.txt b/examples/data/multi_modal/Vincent van Gogh.txt new file mode 100644 index 0000000000000000000000000000000000000000..30b127be017a095afff17a95a5f1f93dfa917968 Binary files /dev/null and b/examples/data/multi_modal/Vincent van Gogh.txt differ diff --git a/examples/multiModal.ts b/examples/multiModal.ts index 941b54b065580aa0d23329be677671ee7c4df04e..8a31980d550e30cd0c1ae90dca2ad7bda76d30b0 100644 --- a/examples/multiModal.ts +++ b/examples/multiModal.ts @@ -1,9 +1,12 @@ import { + ImageNode, serviceContextFromDefaults, SimpleDirectoryReader, SimpleVectorStore, + TextNode, VectorStoreIndex, } from "llamaindex"; +import * as path from "path"; async function main() { // read data into documents @@ -28,7 +31,17 @@ async function main() { "what are Vincent van Gogh's famous paintings", ); for (const result of results) { - console.log(result.node); + const node = result.node; + if (!node) { + continue; + } + if (node instanceof ImageNode) { + console.log(`Image: ${path.join(__dirname, node.id_)}`); + } else if (node instanceof TextNode) { + console.log("Text:", (node as TextNode).text.substring(0, 128)); + } + console.log(`ID: ${node.id_}`); + console.log(`Similarity: ${result.score}`); } } diff --git a/packages/core/src/Node.ts b/packages/core/src/Node.ts index ede6d92eeb104f4fbf7d46f3acd917bbcbec0e21..67ed91a1c16220a732b7e2fee1a6514a8ab46b9f 100644 --- a/packages/core/src/Node.ts +++ b/packages/core/src/Node.ts @@ -14,6 +14,7 @@ export enum ObjectType { IMAGE = "IMAGE", INDEX = "INDEX", DOCUMENT = "DOCUMENT", + IMAGE_DOCUMENT = "IMAGE_DOCUMENT", } export enum MetadataMode { @@ -229,17 +230,6 @@ export class TextNode<T extends Metadata = Metadata> extends BaseNode<T> { } } -export type ImageType = string | Blob | URL; - -export class ImageNode<T extends Metadata = Metadata> extends TextNode<T> { - image?: ImageType; // image as blob - textEmbedding?: number[]; // Assuming text embedding is an array of numbers - - getType(): ObjectType { - return ObjectType.IMAGE; - } -} - export class IndexNode<T extends Metadata = Metadata> extends TextNode<T> { indexId: string = ""; @@ -288,15 +278,37 @@ export function jsonToNode(json: any, type?: ObjectType) { return new IndexNode(json); case ObjectType.DOCUMENT: return new Document(json); + case ObjectType.IMAGE_DOCUMENT: + return new ImageDocument(json); default: throw new Error(`Invalid node type: ${nodeType}`); } } +export type ImageType = string | Blob | URL; + +export type ImageNodeConstructorProps<T extends Metadata> = Pick< + ImageNode<T>, + "image" | "id_" +> & + Partial<ImageNode<T>>; + +export class ImageNode<T extends Metadata = Metadata> extends TextNode<T> { + image: ImageType; // image as blob + + constructor(init: ImageNodeConstructorProps<T>) { + super(init); + this.image = init.image; + } + + getType(): ObjectType { + return ObjectType.IMAGE; + } +} + export class ImageDocument<T extends Metadata = Metadata> extends ImageNode<T> { - constructor(init?: Partial<ImageDocument<T>>) { + constructor(init: ImageNodeConstructorProps<T>) { super(init); - Object.assign(this, init); if (new.target === ImageDocument) { this.hash = this.generateHash(); @@ -304,7 +316,7 @@ export class ImageDocument<T extends Metadata = Metadata> extends ImageNode<T> { } getType() { - return ObjectType.DOCUMENT; + return ObjectType.IMAGE_DOCUMENT; } } diff --git a/packages/core/src/NodeParser.ts b/packages/core/src/NodeParser.ts index f3d064ba5738702f4a11ba12483d6eb8122735d1..d39aae5ae98fd5ef25103850f990439e55ed94b2 100644 --- a/packages/core/src/NodeParser.ts +++ b/packages/core/src/NodeParser.ts @@ -1,4 +1,10 @@ -import { Document, NodeRelationship, TextNode } from "./Node"; +import { + BaseNode, + Document, + ImageDocument, + NodeRelationship, + TextNode, +} from "./Node"; import { SentenceSplitter } from "./TextSplitter"; import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "./constants"; @@ -27,12 +33,19 @@ export function getTextSplitsFromDocument( * @returns An array of nodes. */ export function getNodesFromDocument( - document: Document, + doc: BaseNode, textSplitter: SentenceSplitter, includeMetadata: boolean = true, includePrevNextRel: boolean = true, ) { - let nodes: TextNode[] = []; + if (doc instanceof ImageDocument) { + return [doc]; + } + if (!(doc instanceof Document)) { + throw new Error("Expected either an Image Document or Document"); + } + const document = doc as Document; + const nodes: TextNode[] = []; const textSplits = getTextSplitsFromDocument(document, textSplitter); @@ -62,7 +75,7 @@ export function getNodesFromDocument( } /** - * A NodeParser generates TextNodes from Documents + * A NodeParser generates Nodes from Documents */ export interface NodeParser { /** @@ -70,7 +83,7 @@ export interface NodeParser { * @param documents - The documents to generate nodes from. * @returns An array of nodes. */ - getNodesFromDocuments(documents: Document[]): TextNode[]; + getNodesFromDocuments(documents: BaseNode[]): BaseNode[]; } /** @@ -121,7 +134,7 @@ export class SimpleNodeParser implements NodeParser { * Generate Node objects from documents * @param documents */ - getNodesFromDocuments(documents: Document[]) { + getNodesFromDocuments(documents: BaseNode[]) { return documents .map((document) => getNodesFromDocument(document, this.textSplitter)) .flat(); diff --git a/packages/core/src/embeddings/MultiModalEmbedding.ts b/packages/core/src/embeddings/MultiModalEmbedding.ts index 43bb854a4c92a3af321d223026442bfb9082fd01..46d68ec25948db03c0137c35094acb72f6af557d 100644 --- a/packages/core/src/embeddings/MultiModalEmbedding.ts +++ b/packages/core/src/embeddings/MultiModalEmbedding.ts @@ -9,7 +9,6 @@ export abstract class MultiModalEmbedding extends BaseEmbedding { abstract getImageEmbedding(images: ImageType): Promise<number[]>; async getImageEmbeddings(images: ImageType[]): Promise<number[][]> { - // Embed the input sequence of images asynchronously. return Promise.all( images.map((imgFilePath) => this.getImageEmbedding(imgFilePath)), ); diff --git a/packages/core/src/indices/vectorStore/VectorIndexRetriever.ts b/packages/core/src/indices/vectorStore/VectorIndexRetriever.ts index fb9c8ee71e965a4cef9ef69b643491323fbf15a1..b24b732ccf17d06b2ba926d4cecd8ab9f8543d05 100644 --- a/packages/core/src/indices/vectorStore/VectorIndexRetriever.ts +++ b/packages/core/src/indices/vectorStore/VectorIndexRetriever.ts @@ -41,7 +41,7 @@ export class VectorIndexRetriever implements BaseRetriever { ): Promise<NodeWithScore[]> { let nodesWithScores = await this.textRetrieve(query, preFilters); nodesWithScores = nodesWithScores.concat( - await this.imageRetrieve(query, preFilters), + await this.textToImageRetrieve(query, preFilters), ); this.sendEvent(query, nodesWithScores, parentEvent); return nodesWithScores; @@ -56,7 +56,7 @@ export class VectorIndexRetriever implements BaseRetriever { return this.buildNodeListFromQueryResult(result); } - private async imageRetrieve(query: string, preFilters?: unknown) { + private async textToImageRetrieve(query: string, preFilters?: unknown) { if (!this.index.imageEmbedModel || !this.index.imageVectorStore) { // no-op if image embedding and vector store are not set return []; diff --git a/packages/core/src/indices/vectorStore/VectorStoreIndex.ts b/packages/core/src/indices/vectorStore/VectorStoreIndex.ts index 99c9101f31ebb4005f76a132b1742560f43f3a26..6721aaa862da17a9bc3019cec421b2f6c8218300 100644 --- a/packages/core/src/indices/vectorStore/VectorStoreIndex.ts +++ b/packages/core/src/indices/vectorStore/VectorStoreIndex.ts @@ -1,4 +1,3 @@ -import _ from "lodash"; import { BaseNode, Document, @@ -150,7 +149,6 @@ export class VectorStoreIndex extends BaseIndex<IndexDict> { /** * Get the embeddings for nodes. * @param nodes - * @param serviceContext * @param logProgress log progress to console (useful for debugging) * @returns */ @@ -348,11 +346,6 @@ export class VectorStoreIndex extends BaseIndex<IndexDict> { nodes: ImageNode[], logProgress: boolean = false, ): Promise<BaseNode[]> { - const isImageToText = nodes.every((node) => _.isString(node.text)); - if (isImageToText) { - // every image node has a text, use the text embedding model - return this.getNodeEmbeddingResults(nodes, logProgress); - } if (!this.imageEmbedModel) { return []; } @@ -364,9 +357,7 @@ export class VectorStoreIndex extends BaseIndex<IndexDict> { if (logProgress) { console.log(`getting embedding for node ${i}/${nodes.length}`); } - node.embedding = await this.imageEmbedModel.getImageEmbedding( - node.getContent(MetadataMode.EMBED), - ); + node.embedding = await this.imageEmbedModel.getImageEmbedding(node.image); nodesWithEmbeddings.push(node); } @@ -383,8 +374,7 @@ export class VectorStoreIndex extends BaseIndex<IndexDict> { for (let node of nodes) { if (node instanceof ImageNode) { imageNodes.push(node); - } - if (node instanceof TextNode) { + } else if (node instanceof TextNode) { textNodes.push(node); } } diff --git a/packages/core/src/readers/ImageReader.ts b/packages/core/src/readers/ImageReader.ts index be6ec431d69080503b183a4e1e568d379baee3fb..fd1b3969558b7076a3ceaad960eb07f9fe86f42e 100644 --- a/packages/core/src/readers/ImageReader.ts +++ b/packages/core/src/readers/ImageReader.ts @@ -18,7 +18,7 @@ export class ImageReader implements BaseReader { file: string, fs: GenericFileSystem = DEFAULT_FS, ): Promise<Document[]> { - const dataBuffer = await fs.readFile(file, "utf-8"); + const dataBuffer = await fs.readFile(file); const blob = new Blob([dataBuffer]); return [new ImageDocument({ image: blob, id_: file })]; }