diff --git a/.changeset/gorgeous-bees-hide.md b/.changeset/gorgeous-bees-hide.md new file mode 100644 index 0000000000000000000000000000000000000000..4b0c5dfd0837294a7bcca81418948fec23d586e5 --- /dev/null +++ b/.changeset/gorgeous-bees-hide.md @@ -0,0 +1,7 @@ +--- +"llamaindex": patch +--- + +fix: add `serializer` in doc store + +`PostgresDocumentStore` no longer uses `JSON.stringify`, for better performance diff --git a/packages/llamaindex/src/ingestion/IngestionCache.ts b/packages/llamaindex/src/ingestion/IngestionCache.ts index 353e565f85a6533984400e0138c20a4e403c5594..4d6352f8fa40fc25c5e4cb468d2e88f62a206e42 100644 --- a/packages/llamaindex/src/ingestion/IngestionCache.ts +++ b/packages/llamaindex/src/ingestion/IngestionCache.ts @@ -1,7 +1,11 @@ import type { BaseNode, TransformComponent } from "@llamaindex/core/schema"; import { MetadataMode } from "@llamaindex/core/schema"; import { createSHA256 } from "@llamaindex/env"; -import { docToJson, jsonToDoc } from "../storage/docStore/utils.js"; +import { + docToJson, + jsonSerializer, + jsonToDoc, +} from "../storage/docStore/utils.js"; import { SimpleKVStore } from "../storage/kvStore/SimpleKVStore.js"; import type { BaseKVStore } from "../storage/kvStore/types.js"; @@ -53,7 +57,7 @@ export class IngestionCache { async put(hash: string, nodes: BaseNode[]) { const val = { - [this.nodesKey]: nodes.map((node) => docToJson(node)), + [this.nodesKey]: nodes.map((node) => docToJson(node, jsonSerializer)), }; await this.cache.put(hash, val, this.collection); } @@ -63,6 +67,8 @@ export class IngestionCache { if (!json || !json[this.nodesKey] || !Array.isArray(json[this.nodesKey])) { return undefined; } - return json[this.nodesKey].map((doc: any) => jsonToDoc(doc)); + return json[this.nodesKey].map((doc: any) => + jsonToDoc(doc, jsonSerializer), + ); } } diff --git a/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts b/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts index 
223969b61cc09b4a9147047b0d95723a57fcadcc..d302594d414e0fc9597780eacfc8409ddacce64f 100644 --- a/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts +++ b/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts @@ -29,7 +29,7 @@ export class KVDocumentStore extends BaseDocumentStore { for (const key in jsonDict) { const value = jsonDict[key]; if (isValidDocJson(value)) { - docs[key] = jsonToDoc(value); + docs[key] = jsonToDoc(value, this.serializer); } else { console.warn(`Invalid JSON for docId ${key}`); } @@ -52,7 +52,7 @@ export class KVDocumentStore extends BaseDocumentStore { ); } const nodeKey = doc.id_; - const data = docToJson(doc); + const data = docToJson(doc, this.serializer); await this.kvstore.put(nodeKey, data, this.nodeCollection); const metadata: DocMetaData = { docHash: doc.hash }; @@ -94,7 +94,7 @@ export class KVDocumentStore extends BaseDocumentStore { if (!isValidDocJson(json)) { throw new Error(`Invalid JSON for docId ${docId}`); } - return jsonToDoc(json); + return jsonToDoc(json, this.serializer); } async getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined> { diff --git a/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts b/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts index 2a221d95339097762f835273efb4c77b5dcfd5ac..a910b4a2989ca96f5e6d40add00430106fd64e23 100644 --- a/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts +++ b/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts @@ -4,6 +4,7 @@ import { type PostgresKVStoreConfig, } from "../kvStore/PostgresKVStore.js"; import { KVDocumentStore } from "./KVDocumentStore.js"; +import { noneSerializer } from "./utils.js"; const DEFAULT_TABLE_NAME = "llamaindex_doc_store"; @@ -12,6 +13,8 @@ export type PostgresDocumentStoreConfig = PostgresKVStoreConfig & { }; export class PostgresDocumentStore extends KVDocumentStore { + serializer = noneSerializer; + constructor(config?: PostgresDocumentStoreConfig) { const 
kvStore = new PostgresKVStore({ schemaName: config?.schemaName, diff --git a/packages/llamaindex/src/storage/docStore/types.ts b/packages/llamaindex/src/storage/docStore/types.ts index 8125322e5d921f4950cb77f039481d5c68ddc905..b580d5e17484bd4ca1b20b2279969c34eefc4346 100644 --- a/packages/llamaindex/src/storage/docStore/types.ts +++ b/packages/llamaindex/src/storage/docStore/types.ts @@ -3,6 +3,7 @@ import { DEFAULT_PERSIST_DIR, } from "@llamaindex/core/global"; import { BaseNode } from "@llamaindex/core/schema"; +import { jsonSerializer, type Serializer } from "./utils.js"; const defaultPersistPath = `${DEFAULT_PERSIST_DIR}/${DEFAULT_DOC_STORE_PERSIST_FILENAME}`; @@ -12,6 +13,8 @@ export interface RefDocInfo { } export abstract class BaseDocumentStore { + serializer: Serializer<any> = jsonSerializer; + // Save/load persist(persistPath: string = defaultPersistPath): void { // Persist the docstore to a file. diff --git a/packages/llamaindex/src/storage/docStore/utils.ts b/packages/llamaindex/src/storage/docStore/utils.ts index 1735b81eeafab87a6ac6fea4a0fb9a0fa01e35c9..68596a0b12f5b76b4f48d9b4e059bc59644c5a94 100644 --- a/packages/llamaindex/src/storage/docStore/utils.ts +++ b/packages/llamaindex/src/storage/docStore/utils.ts @@ -4,12 +4,35 @@ import { Document, ObjectType, TextNode } from "@llamaindex/core/schema"; const TYPE_KEY = "__type__"; const DATA_KEY = "__data__"; -type DocJson = { +export interface Serializer<T> { + toPersistence(data: Record<string, unknown>): T; + fromPersistence(data: T): Record<string, unknown>; +} + +export const jsonSerializer: Serializer<string> = { + toPersistence(data) { + return JSON.stringify(data); + }, + fromPersistence(data) { + return JSON.parse(data); + }, +}; + +export const noneSerializer: Serializer<Record<string, unknown>> = { + toPersistence(data) { + return data; + }, + fromPersistence(data) { + return data; + }, +}; + +type DocJson<Data> = { [TYPE_KEY]: ObjectType; - [DATA_KEY]: string; + [DATA_KEY]: Data; }; -export 
function isValidDocJson(docJson: any): docJson is DocJson { +export function isValidDocJson(docJson: any): docJson is DocJson<unknown> { return ( typeof docJson === "object" && docJson !== null && @@ -18,16 +41,22 @@ export function isValidDocJson(docJson: any): docJson is DocJson { ); } -export function docToJson(doc: BaseNode): DocJson { +export function docToJson( + doc: BaseNode, + serializer: Serializer<unknown>, +): DocJson<unknown> { return { - [DATA_KEY]: JSON.stringify(doc.toJSON()), + [DATA_KEY]: serializer.toPersistence(doc.toJSON()), [TYPE_KEY]: doc.type, }; } -export function jsonToDoc(docDict: DocJson): BaseNode { +export function jsonToDoc<Data>( + docDict: DocJson<Data>, + serializer: Serializer<Data>, +): BaseNode { const docType = docDict[TYPE_KEY]; - const dataDict = JSON.parse(docDict[DATA_KEY]); + const dataDict = serializer.fromPersistence(docDict[DATA_KEY]) as any; let doc: BaseNode; if (docType === ObjectType.DOCUMENT) {