From 23bcc379a8f90ec7e921122d46bb6189173cd461 Mon Sep 17 00:00:00 2001
From: Goran <gospaso@gmail.com>
Date: Mon, 23 Sep 2024 22:11:51 +0200
Subject: [PATCH] fix: add `serializer` in doc store (#1243)

Co-authored-by: Alex Yang <himself65@outlook.com>
---
 .changeset/gorgeous-bees-hide.md              |  7 +++
 .../src/ingestion/IngestionCache.ts           | 12 ++++--
 .../src/storage/docStore/KVDocumentStore.ts   |  6 +--
 .../storage/docStore/PostgresDocumentStore.ts |  3 ++
 .../llamaindex/src/storage/docStore/types.ts  |  3 ++
 .../llamaindex/src/storage/docStore/utils.ts  | 43 ++++++++++++++++---
 6 files changed, 61 insertions(+), 13 deletions(-)
 create mode 100644 .changeset/gorgeous-bees-hide.md

diff --git a/.changeset/gorgeous-bees-hide.md b/.changeset/gorgeous-bees-hide.md
new file mode 100644
index 000000000..4b0c5dfd0
--- /dev/null
+++ b/.changeset/gorgeous-bees-hide.md
@@ -0,0 +1,7 @@
+---
+"llamaindex": patch
+---
+
+fix: add `serializer` in doc store
+
+`PostgresDocumentStore` no longer serializes documents with `JSON.stringify`, which improves performance
diff --git a/packages/llamaindex/src/ingestion/IngestionCache.ts b/packages/llamaindex/src/ingestion/IngestionCache.ts
index 353e565f8..4d6352f8f 100644
--- a/packages/llamaindex/src/ingestion/IngestionCache.ts
+++ b/packages/llamaindex/src/ingestion/IngestionCache.ts
@@ -1,7 +1,11 @@
 import type { BaseNode, TransformComponent } from "@llamaindex/core/schema";
 import { MetadataMode } from "@llamaindex/core/schema";
 import { createSHA256 } from "@llamaindex/env";
-import { docToJson, jsonToDoc } from "../storage/docStore/utils.js";
+import {
+  docToJson,
+  jsonSerializer,
+  jsonToDoc,
+} from "../storage/docStore/utils.js";
 import { SimpleKVStore } from "../storage/kvStore/SimpleKVStore.js";
 import type { BaseKVStore } from "../storage/kvStore/types.js";
 
@@ -53,7 +57,7 @@ export class IngestionCache {
 
   async put(hash: string, nodes: BaseNode[]) {
     const val = {
-      [this.nodesKey]: nodes.map((node) => docToJson(node)),
+      [this.nodesKey]: nodes.map((node) => docToJson(node, jsonSerializer)),
     };
     await this.cache.put(hash, val, this.collection);
   }
@@ -63,6 +67,8 @@ export class IngestionCache {
     if (!json || !json[this.nodesKey] || !Array.isArray(json[this.nodesKey])) {
       return undefined;
     }
-    return json[this.nodesKey].map((doc: any) => jsonToDoc(doc));
+    return json[this.nodesKey].map((doc: any) =>
+      jsonToDoc(doc, jsonSerializer),
+    );
   }
 }
diff --git a/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts b/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts
index 223969b61..d302594d4 100644
--- a/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts
+++ b/packages/llamaindex/src/storage/docStore/KVDocumentStore.ts
@@ -29,7 +29,7 @@ export class KVDocumentStore extends BaseDocumentStore {
     for (const key in jsonDict) {
       const value = jsonDict[key];
       if (isValidDocJson(value)) {
-        docs[key] = jsonToDoc(value);
+        docs[key] = jsonToDoc(value, this.serializer);
       } else {
         console.warn(`Invalid JSON for docId ${key}`);
       }
@@ -52,7 +52,7 @@ export class KVDocumentStore extends BaseDocumentStore {
         );
       }
       const nodeKey = doc.id_;
-      const data = docToJson(doc);
+      const data = docToJson(doc, this.serializer);
       await this.kvstore.put(nodeKey, data, this.nodeCollection);
       const metadata: DocMetaData = { docHash: doc.hash };
 
@@ -94,7 +94,7 @@ export class KVDocumentStore extends BaseDocumentStore {
     if (!isValidDocJson(json)) {
       throw new Error(`Invalid JSON for docId ${docId}`);
     }
-    return jsonToDoc(json);
+    return jsonToDoc(json, this.serializer);
   }
 
   async getRefDocInfo(refDocId: string): Promise<RefDocInfo | undefined> {
diff --git a/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts b/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts
index 2a221d953..a910b4a29 100644
--- a/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts
+++ b/packages/llamaindex/src/storage/docStore/PostgresDocumentStore.ts
@@ -4,6 +4,7 @@ import {
   type PostgresKVStoreConfig,
 } from "../kvStore/PostgresKVStore.js";
 import { KVDocumentStore } from "./KVDocumentStore.js";
+import { noneSerializer } from "./utils.js";
 
 const DEFAULT_TABLE_NAME = "llamaindex_doc_store";
 
@@ -12,6 +13,8 @@ export type PostgresDocumentStoreConfig = PostgresKVStoreConfig & {
 };
 
 export class PostgresDocumentStore extends KVDocumentStore {
+  serializer = noneSerializer;
+
   constructor(config?: PostgresDocumentStoreConfig) {
     const kvStore = new PostgresKVStore({
       schemaName: config?.schemaName,
diff --git a/packages/llamaindex/src/storage/docStore/types.ts b/packages/llamaindex/src/storage/docStore/types.ts
index 8125322e5..b580d5e17 100644
--- a/packages/llamaindex/src/storage/docStore/types.ts
+++ b/packages/llamaindex/src/storage/docStore/types.ts
@@ -3,6 +3,7 @@ import {
   DEFAULT_PERSIST_DIR,
 } from "@llamaindex/core/global";
 import { BaseNode } from "@llamaindex/core/schema";
+import { jsonSerializer, type Serializer } from "./utils.js";
 
 const defaultPersistPath = `${DEFAULT_PERSIST_DIR}/${DEFAULT_DOC_STORE_PERSIST_FILENAME}`;
 
@@ -12,6 +13,8 @@ export interface RefDocInfo {
 }
 
 export abstract class BaseDocumentStore {
+  serializer: Serializer<any> = jsonSerializer;
+
   // Save/load
   persist(persistPath: string = defaultPersistPath): void {
     // Persist the docstore to a file.
diff --git a/packages/llamaindex/src/storage/docStore/utils.ts b/packages/llamaindex/src/storage/docStore/utils.ts
index 1735b81ee..68596a0b1 100644
--- a/packages/llamaindex/src/storage/docStore/utils.ts
+++ b/packages/llamaindex/src/storage/docStore/utils.ts
@@ -4,12 +4,35 @@ import { Document, ObjectType, TextNode } from "@llamaindex/core/schema";
 const TYPE_KEY = "__type__";
 const DATA_KEY = "__data__";
 
-type DocJson = {
+export interface Serializer<T> {
+  toPersistence(data: Record<string, unknown>): T;
+  fromPersistence(data: T): Record<string, unknown>;
+}
+
+export const jsonSerializer: Serializer<string> = {
+  toPersistence(data) {
+    return JSON.stringify(data);
+  },
+  fromPersistence(data) {
+    return JSON.parse(data);
+  },
+};
+
+export const noneSerializer: Serializer<Record<string, unknown>> = {
+  toPersistence(data) {
+    return data;
+  },
+  fromPersistence(data) {
+    return data;
+  },
+};
+
+type DocJson<Data> = {
   [TYPE_KEY]: ObjectType;
-  [DATA_KEY]: string;
+  [DATA_KEY]: Data;
 };
 
-export function isValidDocJson(docJson: any): docJson is DocJson {
+export function isValidDocJson(docJson: any): docJson is DocJson<unknown> {
   return (
     typeof docJson === "object" &&
     docJson !== null &&
@@ -18,16 +41,22 @@ export function isValidDocJson(docJson: any): docJson is DocJson {
   );
 }
 
-export function docToJson(doc: BaseNode): DocJson {
+export function docToJson(
+  doc: BaseNode,
+  serializer: Serializer<unknown>,
+): DocJson<unknown> {
   return {
-    [DATA_KEY]: JSON.stringify(doc.toJSON()),
+    [DATA_KEY]: serializer.toPersistence(doc.toJSON()),
     [TYPE_KEY]: doc.type,
   };
 }
 
-export function jsonToDoc(docDict: DocJson): BaseNode {
+export function jsonToDoc<Data>(
+  docDict: DocJson<Data>,
+  serializer: Serializer<Data>,
+): BaseNode {
   const docType = docDict[TYPE_KEY];
-  const dataDict = JSON.parse(docDict[DATA_KEY]);
+  const dataDict = serializer.fromPersistence(docDict[DATA_KEY]) as any;
   let doc: BaseNode;
 
   if (docType === ObjectType.DOCUMENT) {
-- 
GitLab