From 539ec0fe3dd27206720221beb786b4c86d50df0b Mon Sep 17 00:00:00 2001
From: Yi Ding <yi.s.ding@gmail.com>
Date: Thu, 6 Jul 2023 21:41:13 -0700
Subject: [PATCH] Make chunk size configurable and try smaller chunks

---
 apps/simple/chatEngine.ts           |  9 ++++-
 apps/simple/openai.ts               |  1 +
 packages/core/src/NodeParser.ts     | 56 +++++++++++++++++++++++------
 packages/core/src/ServiceContext.ts |  7 +++-
 4 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/apps/simple/chatEngine.ts b/apps/simple/chatEngine.ts
index 26ff083d5..9b31847d9 100644
--- a/apps/simple/chatEngine.ts
+++ b/apps/simple/chatEngine.ts
@@ -6,11 +6,18 @@ import { Document } from "@llamaindex/core/src/Node";
 import { VectorStoreIndex } from "@llamaindex/core/src/BaseIndex";
 import { ContextChatEngine } from "@llamaindex/core/src/ChatEngine";
 import essay from "./essay";
+import { serviceContextFromDefaults } from "@llamaindex/core/src/ServiceContext";
 
 async function main() {
   const document = new Document({ text: essay });
-  const index = await VectorStoreIndex.fromDocuments([document]);
+  const serviceContext = serviceContextFromDefaults({ chunkSize: 512 });
+  const index = await VectorStoreIndex.fromDocuments(
+    [document],
+    undefined,
+    serviceContext
+  );
   const retriever = index.asRetriever();
+  retriever.similarityTopK = 5;
   const chatEngine = new ContextChatEngine({ retriever });
   const rl = readline.createInterface({ input, output });
 
diff --git a/apps/simple/openai.ts b/apps/simple/openai.ts
index 0977b8d35..200ed8ea3 100644
--- a/apps/simple/openai.ts
+++ b/apps/simple/openai.ts
@@ -1,3 +1,4 @@
+// @ts-ignore
 import process from "node:process";
 import { Configuration, OpenAIWrapper } from "@llamaindex/core/src/openai";
 
diff --git a/packages/core/src/NodeParser.ts b/packages/core/src/NodeParser.ts
index 7f191e222..f3e173f94 100644
--- a/packages/core/src/NodeParser.ts
+++ b/packages/core/src/NodeParser.ts
@@ -1,5 +1,6 @@
 import { Document, NodeRelationship, TextNode } from "./Node";
 import { SentenceSplitter } from "./TextSplitter";
+import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "./constants";
 
 export function getTextSplitsFromDocument(
   document: Document,
@@ -13,18 +14,36 @@ export function getTextSplitsFromDocument(
 
 export function getNodesFromDocument(
   document: Document,
-  textSplitter: SentenceSplitter
+  textSplitter: SentenceSplitter,
+  includeMetadata: boolean = true, // when true, each node gets the document's metadata
+  includePrevNextRel: boolean = true // when true, neighbouring nodes are linked PREVIOUS/NEXT
 ) {
   let nodes: TextNode[] = [];
 
   const textSplits = getTextSplitsFromDocument(document, textSplitter);
 
   textSplits.forEach((textSplit) => {
-    const node = new TextNode({ text: textSplit });
+    const node = new TextNode({
+      text: textSplit,
+      metadata: includeMetadata ? { ...document.metadata } : {}, // own copy per node, never a shared reference
+    });
     node.relationships[NodeRelationship.SOURCE] = document.asRelatedNodeInfo();
     nodes.push(node);
   });
 
+  if (includePrevNextRel) {
+    nodes.forEach((node, index) => {
+      if (index > 0) {
+        node.relationships[NodeRelationship.PREVIOUS] =
+          nodes[index - 1].asRelatedNodeInfo();
+      }
+      if (index < nodes.length - 1) {
+        node.relationships[NodeRelationship.NEXT] =
+          nodes[index + 1].asRelatedNodeInfo();
+      }
+    });
+  }
+
   return nodes;
 }
 
@@ -33,17 +52,34 @@ export interface NodeParser {
 }
 export class SimpleNodeParser implements NodeParser {
   textSplitter: SentenceSplitter;
+  includeMetadata: boolean;
+  includePrevNextRel: boolean;
+
+  constructor(init?: {
+    textSplitter?: SentenceSplitter;
+    includeMetadata?: boolean;
+    includePrevNextRel?: boolean;
 
-  constructor(
-    textSplitter: any = null,
-    includeExtraInfo: boolean = true,
-    includePrevNextRel: boolean = true
-  ) {
-    this.textSplitter = textSplitter ?? new SentenceSplitter();
+    chunkSize?: number;
+    chunkOverlap?: number;
+  }) {
+    this.textSplitter =
+      init?.textSplitter ??
+      new SentenceSplitter(
+        init?.chunkSize ?? DEFAULT_CHUNK_SIZE,
+        init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP
+      );
+    this.includeMetadata = init?.includeMetadata ?? true;
+    this.includePrevNextRel = init?.includePrevNextRel ?? true;
   }
 
-  static fromDefaults(): SimpleNodeParser {
-    return new SimpleNodeParser();
+  static fromDefaults(init?: {
+    chunkSize?: number;
+    chunkOverlap?: number;
+    includeMetadata?: boolean;
+    includePrevNextRel?: boolean;
+  }): SimpleNodeParser {
+    return new SimpleNodeParser(init);
   }
 
   /**
diff --git a/packages/core/src/ServiceContext.ts b/packages/core/src/ServiceContext.ts
index dd72c771f..9df16f9dc 100644
--- a/packages/core/src/ServiceContext.ts
+++ b/packages/core/src/ServiceContext.ts
@@ -28,7 +28,12 @@ export function serviceContextFromDefaults(options?: ServiceContextOptions) {
   const serviceContext: ServiceContext = {
     llmPredictor: options?.llmPredictor ?? new ChatGPTLLMPredictor(),
     embedModel: options?.embedModel ?? new OpenAIEmbedding(),
-    nodeParser: options?.nodeParser ?? new SimpleNodeParser(),
+    nodeParser:
+      options?.nodeParser ??
+      new SimpleNodeParser({
+        chunkSize: options?.chunkSize,
+        chunkOverlap: options?.chunkOverlap,
+      }),
     promptHelper: options?.promptHelper ?? new PromptHelper(),
   };
 
-- 
GitLab