From 539ec0fe3dd27206720221beb786b4c86d50df0b Mon Sep 17 00:00:00 2001 From: Yi Ding <yi.s.ding@gmail.com> Date: Thu, 6 Jul 2023 21:41:13 -0700 Subject: [PATCH] try smaller chunks --- apps/simple/chatEngine.ts | 9 ++++- apps/simple/openai.ts | 1 + packages/core/src/NodeParser.ts | 56 +++++++++++++++++++++++------ packages/core/src/ServiceContext.ts | 7 +++- 4 files changed, 61 insertions(+), 12 deletions(-) diff --git a/apps/simple/chatEngine.ts b/apps/simple/chatEngine.ts index 26ff083d5..9b31847d9 100644 --- a/apps/simple/chatEngine.ts +++ b/apps/simple/chatEngine.ts @@ -6,11 +6,18 @@ import { Document } from "@llamaindex/core/src/Node"; import { VectorStoreIndex } from "@llamaindex/core/src/BaseIndex"; import { ContextChatEngine } from "@llamaindex/core/src/ChatEngine"; import essay from "./essay"; +import { serviceContextFromDefaults } from "@llamaindex/core/src/ServiceContext"; async function main() { const document = new Document({ text: essay }); - const index = await VectorStoreIndex.fromDocuments([document]); + const serviceContext = serviceContextFromDefaults({ chunkSize: 512 }); + const index = await VectorStoreIndex.fromDocuments( + [document], + undefined, + serviceContext + ); const retriever = index.asRetriever(); + retriever.similarityTopK = 5; const chatEngine = new ContextChatEngine({ retriever }); const rl = readline.createInterface({ input, output }); diff --git a/apps/simple/openai.ts b/apps/simple/openai.ts index 0977b8d35..200ed8ea3 100644 --- a/apps/simple/openai.ts +++ b/apps/simple/openai.ts @@ -1,3 +1,4 @@ +// @ts-ignore import process from "node:process"; import { Configuration, OpenAIWrapper } from "@llamaindex/core/src/openai"; diff --git a/packages/core/src/NodeParser.ts b/packages/core/src/NodeParser.ts index 7f191e222..f3e173f94 100644 --- a/packages/core/src/NodeParser.ts +++ b/packages/core/src/NodeParser.ts @@ -1,5 +1,6 @@ import { Document, NodeRelationship, TextNode } from "./Node"; import { SentenceSplitter } from "./TextSplitter"; +import { DEFAULT_CHUNK_OVERLAP, DEFAULT_CHUNK_SIZE } from "./constants"; export function getTextSplitsFromDocument( document: Document, @@ -13,18 +14,36 @@ export function getTextSplitsFromDocument( export function getNodesFromDocument( document: Document, - textSplitter: SentenceSplitter + textSplitter: SentenceSplitter, + includeMetadata: boolean = true, + includePrevNextRel: boolean = true ) { let nodes: TextNode[] = []; const textSplits = getTextSplitsFromDocument(document, textSplitter); textSplits.forEach((textSplit) => { - const node = new TextNode({ text: textSplit }); + const node = new TextNode({ + text: textSplit, + metadata: includeMetadata ? document.metadata : {}, + }); node.relationships[NodeRelationship.SOURCE] = document.asRelatedNodeInfo(); nodes.push(node); }); + if (includePrevNextRel) { + nodes.forEach((node, index) => { + if (index > 0) { + node.relationships[NodeRelationship.PREVIOUS] = + nodes[index - 1].asRelatedNodeInfo(); + } + if (index < nodes.length - 1) { + node.relationships[NodeRelationship.NEXT] = + nodes[index + 1].asRelatedNodeInfo(); + } + }); + } + return nodes; } @@ -33,17 +52,34 @@ export interface NodeParser { } export class SimpleNodeParser implements NodeParser { textSplitter: SentenceSplitter; + includeMetadata: boolean; + includePrevNextRel: boolean; + + constructor(init?: { + textSplitter?: SentenceSplitter; + includeMetadata?: boolean; + includePrevNextRel?: boolean; - constructor( - textSplitter: any = null, - includeExtraInfo: boolean = true, - includePrevNextRel: boolean = true - ) { - this.textSplitter = textSplitter ?? new SentenceSplitter(); + chunkSize?: number; + chunkOverlap?: number; + }) { + this.textSplitter = + init?.textSplitter ?? + new SentenceSplitter( + init?.chunkSize ?? DEFAULT_CHUNK_SIZE, + init?.chunkOverlap ?? DEFAULT_CHUNK_OVERLAP + ); + this.includeMetadata = init?.includeMetadata ?? true; + this.includePrevNextRel = init?.includePrevNextRel ?? true; } - static fromDefaults(): SimpleNodeParser { - return new SimpleNodeParser(); + static fromDefaults(init?: { + chunkSize?: number; + chunkOverlap?: number; + includeMetadata?: boolean; + includePrevNextRel?: boolean; + }): SimpleNodeParser { + return new SimpleNodeParser(init); } /** diff --git a/packages/core/src/ServiceContext.ts b/packages/core/src/ServiceContext.ts index dd72c771f..9df16f9dc 100644 --- a/packages/core/src/ServiceContext.ts +++ b/packages/core/src/ServiceContext.ts @@ -28,7 +28,12 @@ export function serviceContextFromDefaults(options?: ServiceContextOptions) { const serviceContext: ServiceContext = { llmPredictor: options?.llmPredictor ?? new ChatGPTLLMPredictor(), embedModel: options?.embedModel ?? new OpenAIEmbedding(), - nodeParser: options?.nodeParser ?? new SimpleNodeParser(), + nodeParser: + options?.nodeParser ?? + new SimpleNodeParser({ + chunkSize: options?.chunkSize, + chunkOverlap: options?.chunkOverlap, + }), promptHelper: options?.promptHelper ?? new PromptHelper(), }; -- GitLab