From 23bcc379a8f90ec7e921122d46bb6189173cd461 Mon Sep 17 00:00:00 2001 From: Goran <gospaso@gmail.com> Date: Mon, 23 Sep 2024 22:11:51 +0200 Subject: [PATCH] fix: add `serializer` in doc store (#1243) Co-authored-by: Alex Yang <himself65@outlook.com> --- .changeset/gorgeous-bees-hide.md | 7 +++ .../src/ingestion/IngestionCache.ts | 12 ++++-- .../src/storage/docStore/KVDocumentStore.ts | 6 +-- .../storage/docStore/PostgresDocumentStore.ts | 3 ++ .../llamaindex/src/storage/docStore/types.ts | 3 ++ .../llamaindex/src/storage/docStore/utils.ts | 43 ++++++++++++++++--- 6 files changed, 61 insertions(+), 13 deletions(-) create mode 100644 .changeset/gorgeous-bees-hide.md diff --git a/.changeset/gorgeous-bees-hide.md b/.changeset/gorgeous-bees-hide.md new file mode 100644 index 000000000..4b0c5dfd0 --- /dev/null +++ b/.changeset/gorgeous-bees-hide.md @@ -0,0 +1,7 @@ +--- +"llamaindex": patch +--- + +fix: add `serializer` in doc store + +`PostgresDocumentStore` no longer calls `JSON.stringify` on documents, which improves performance diff --git a/packages/llamaindex/src/ingestion/IngestionCache.ts b/packages/llamaindex/src/ingestion/IngestionCache.ts index 353e565f8..4d6352f8f 100644 --- a/packages/llamaindex/src/ingestion/IngestionCache.ts +++ b/packages/llamaindex/src/ingestion/IngestionCache.ts @@ -1,7 +1,11 @@ import type { BaseNode, TransformComponent } from "@llamaindex/core/schema"; import { MetadataMode } from "@llamaindex/core/schema"; import { createSHA256 } from "@llamaindex/env"; -import { docToJson, jsonToDoc } from "../storage/docStore/utils.js"; +import { + docToJson, + jsonSerializer, + jsonToDoc, +} from "../storage/docStore/utils.js"; import { SimpleKVStore } from "../storage/kvStore/SimpleKVStore.js"; import type { BaseKVStore } from "../storage/kvStore/types.js"; @@ -53,7 +57,7 @@ export class IngestionCache { async put(hash: string, nodes: BaseNode[]) { const val = { - [this.nodesKey]: nodes.map((node) => docToJson(node)), + [this.nodesKey]: nodes.map((node) => 
docToJson(node, jsonSerializer)), }; await this.cache.put(hash, val, this.collection); } @@ -63,6 +67,8 @@ export class IngestionCache { if (!json || !json[this.nodesKey] || !Array.isArray(json[this.nodesKey])) { return undefined; } - return json[this.nodesKey].map((doc: any) => jsonToDoc(doc)); + return json[this.nodesKey].map((doc: any) => + jsonToDoc(doc, jsonSerializer), + ); } } diff --git a/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts b/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts index 223969b61..d302594d4 100644 --- a/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts +++ b/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts @@ -29,7 +29,7 @@ export class KVDocumentStore extends BaseDocumentStore { for (const key in jsonDict) { const value = jsonDict[key]; if (isValidDocJson(value)) { - docs[key] = jsonToDoc(value); + docs[key] = jsonToDoc(value, this.serializer); } else { console.warn(`Invalid JSON for docId ${key}`); } @@ -52,7 +52,7 @@ export class KVDocumentStore extends BaseDocumentStore { ); } const nodeKey = doc.id_; - const data = docToJson(doc); + const data = docToJson(doc, this.serializer); await this.kvstore.put(nodeKey, data, this.nodeCollection); const metadata: DocMetaData = { docHash: doc.hash }; @@ -94,7 +94,7 @@ export class KVDocumentStore extends BaseDocumentStore { if (!isValidDocJson(json)) { throw new Error(`Invalid JSON for docId ${docId}`); } - return jsonToDoc(json); + return jsonToDoc(json, this.serializer); } async getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined> { diff --git a/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts b/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts index 2a221d953..a910b4a29 100644 --- a/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts +++ b/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts @@ -4,6 +4,7 @@ import { type PostgresKVStoreConfig, } from 
"../kvStore/PostgresKVStore.js"; import { KVDocumentStore } from "./KVDocumentStore.js"; +import { noneSerializer } from "./utils.js"; const DEFAULT_TABLE_NAME = "llamaindex_doc_store"; @@ -12,6 +13,8 @@ export type PostgresDocumentStoreConfig = PostgresKVStoreConfig & { }; export class PostgresDocumentStore extends KVDocumentStore { + serializer = noneSerializer; + constructor(config?: PostgresDocumentStoreConfig) { const kvStore = new PostgresKVStore({ schemaName: config?.schemaName, diff --git a/packages/llamaindex/src/storage/docStore/types.ts b/packages/llamaindex/src/storage/docStore/types.ts index 8125322e5..b580d5e17 100644 --- a/packages/llamaindex/src/storage/docStore/types.ts +++ b/packages/llamaindex/src/storage/docStore/types.ts @@ -3,6 +3,7 @@ import { DEFAULT_PERSIST_DIR, } from "@llamaindex/core/global"; import { BaseNode } from "@llamaindex/core/schema"; +import { jsonSerializer, type Serializer } from "./utils.js"; const defaultPersistPath = `${DEFAULT_PERSIST_DIR}/${DEFAULT_DOC_STORE_PERSIST_FILENAME}`; @@ -12,6 +13,8 @@ export interface RefDocInfo { } export abstract class BaseDocumentStore { + serializer: Serializer<any> = jsonSerializer; + // Save/load persist(persistPath: string = defaultPersistPath): void { // Persist the docstore to a file. 
diff --git a/packages/llamaindex/src/storage/docStore/utils.ts b/packages/llamaindex/src/storage/docStore/utils.ts index 1735b81ee..68596a0b1 100644 --- a/packages/llamaindex/src/storage/docStore/utils.ts +++ b/packages/llamaindex/src/storage/docStore/utils.ts @@ -4,12 +4,35 @@ import { Document, ObjectType, TextNode } from "@llamaindex/core/schema"; const TYPE_KEY = "__type__"; const DATA_KEY = "__data__"; -type DocJson = { +export interface Serializer<T> { + toPersistence(data: Record<string, unknown>): T; + fromPersistence(data: T): Record<string, unknown>; +} + +export const jsonSerializer: Serializer<string> = { + toPersistence(data) { + return JSON.stringify(data); + }, + fromPersistence(data) { + return JSON.parse(data); + }, +}; + +export const noneSerializer: Serializer<Record<string, unknown>> = { + toPersistence(data) { + return data; + }, + fromPersistence(data) { + return data; + }, +}; + +type DocJson<Data> = { [TYPE_KEY]: ObjectType; - [DATA_KEY]: string; + [DATA_KEY]: Data; }; -export function isValidDocJson(docJson: any): docJson is DocJson { +export function isValidDocJson(docJson: any): docJson is DocJson<unknown> { return ( typeof docJson === "object" && docJson !== null && @@ -18,16 +41,22 @@ export function isValidDocJson(docJson: any): docJson is DocJson { ); } -export function docToJson(doc: BaseNode): DocJson { +export function docToJson( + doc: BaseNode, + serializer: Serializer<unknown>, +): DocJson<unknown> { return { - [DATA_KEY]: JSON.stringify(doc.toJSON()), + [DATA_KEY]: serializer.toPersistence(doc.toJSON()), [TYPE_KEY]: doc.type, }; } -export function jsonToDoc(docDict: DocJson): BaseNode { +export function jsonToDoc<Data>( + docDict: DocJson<Data>, + serializer: Serializer<Data>, +): BaseNode { const docType = docDict[TYPE_KEY]; - const dataDict = JSON.parse(docDict[DATA_KEY]); + const dataDict = serializer.fromPersistence(docDict[DATA_KEY]) as any; let doc: BaseNode; if (docType === ObjectType.DOCUMENT) { -- GitLab