diff --git a/.changeset/strong-plums-burn.md b/.changeset/strong-plums-burn.md new file mode 100644 index 0000000000000000000000000000000000000000..18cd1fca8403e1fcf216a1e329ac3ecdde895d74 --- /dev/null +++ b/.changeset/strong-plums-burn.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +feat: markdown node parser diff --git a/apps/docs/docs/modules/node_parser.md b/apps/docs/docs/modules/node_parser.md index 9bf3fc304e68e40a01952fd583e661c5e6027a8d..b7ed346b89280a17359ab75952037d83d4dee94b 100644 --- a/apps/docs/docs/modules/node_parser.md +++ b/apps/docs/docs/modules/node_parser.md @@ -27,6 +27,71 @@ const splitter = new SentenceSplitter({ chunkSize: 1 }); const textSplits = splitter.splitText("Hello World"); ``` +## MarkdownNodeParser + +The `MarkdownNodeParser` is a `NodeParser` for markdown documents. It splits each document at its headers into one `TextNode` per section and records the header hierarchy of that section in the node's metadata. + +```typescript +import { Document, MarkdownNodeParser } from "llamaindex"; + +const nodeParser = new MarkdownNodeParser(); + +const nodes = nodeParser.getNodesFromDocuments([ + new Document({ + text: `# Main Header +Main content + +# Header 2 +Header 2 content + +## Sub-header +Sub-header content + + `, + }), +]); +``` + +The output will be something like: + +```bash +[ + TextNode { + id_: '008e41a8-b097-487c-bee8-bd88b9455844', + metadata: { 'Header 1': 'Main Header' }, + excludedEmbedMetadataKeys: [], + excludedLlmMetadataKeys: [], + relationships: { PARENT: [Array] }, + hash: 'KJ5e/um/RkHaNR6bonj9ormtZY7I8i4XBPVYHXv1A5M=', + text: 'Main Header\nMain content', + textTemplate: '', + metadataSeparator: '\n' 
/**
 * Splits Markdown text into one node per header-delimited section.
 *
 * A line made of one or more `#` characters followed by a whitespace
 * character starts a new section, unless the line sits inside a fenced
 * (triple-backtick) code block. Each section becomes a `TextNode` whose text
 * begins with the header title (without the `#` markers) and whose metadata
 * maps `"Header <level>"` to the titles of the enclosing headers.
 */
export class MarkdownNodeParser implements NodeParser {
  /** When `false`, every produced node receives an empty metadata object. */
  includeMetadata: boolean;
  /**
   * Stored from the constructor options. No method of this class reads it,
   * so it currently has no effect on the produced nodes.
   */
  includePrevNextRel: boolean;

  /**
   * @param init Optional flags; both default to `true` when omitted.
   */
  constructor(init?: {
    includeMetadata?: boolean;
    includePrevNextRel?: boolean;
  }) {
    this.includeMetadata = init?.includeMetadata ?? true;
    this.includePrevNextRel = init?.includePrevNextRel ?? true;
  }

  /**
   * Async wrapper around {@link getNodesFromDocuments}.
   *
   * @param nodes Nodes whose text content should be split.
   * @param _options Ignored.
   * @returns The section nodes of every input node, in input order.
   */
  async transform(nodes: BaseNode[], _options?: any): Promise<BaseNode[]> {
    return this.getNodesFromDocuments(nodes);
  }

  /**
   * Factory equivalent to calling the constructor with the same options.
   */
  static fromDefaults(init?: {
    includeMetadata?: boolean;
    includePrevNextRel?: boolean;
  }): MarkdownNodeParser {
    return new MarkdownNodeParser(init);
  }

  /**
   * Wraps one section of text in a new `TextNode` linked to its source node.
   *
   * @param textSplit Text of the section.
   * @param node Source node; it is spread into the `PARENT` relationship
   *   entry together with `nodeId` set to its `id_`.
   * @param metadata Header metadata for the section; replaced by `{}` when
   *   `includeMetadata` is `false`.
   * @returns The newly created node.
   */
  buildNodeFromSplit(
    textSplit: string,
    node: BaseNode<Metadata>,
    metadata: Metadata,
  ): BaseNode<Metadata> {
    return new TextNode({
      text: textSplit,
      relationships: {
        PARENT: [
          {
            ...node,
            nodeId: node.id_,
          },
        ],
      },
      metadata: this.includeMetadata ? metadata : {},
    });
  }

  /**
   * Computes the header metadata that applies after a new header is seen.
   *
   * Entries for levels shallower than the new header are carried over;
   * entries at the same or a deeper level are dropped, and the new header is
   * recorded under `"Header <newHeaderLevel>"`. The input object is not
   * mutated.
   *
   * @param headersMetadata Metadata in effect before the new header.
   * @param newHeader Title of the new header.
   * @param newHeaderLevel Number of `#` characters of the new header.
   * @returns A fresh metadata object.
   */
  updateMetadata(
    headersMetadata: Metadata,
    newHeader: string,
    newHeaderLevel: number,
  ): Metadata {
    const updatedHeaders: Metadata = {};
    for (let level = 1; level < newHeaderLevel; level++) {
      const key = `Header ${level}`;
      if (key in headersMetadata) {
        updatedHeaders[key] = headersMetadata[key];
      }
    }
    updatedHeaders[`Header ${newHeaderLevel}`] = newHeader;
    return updatedHeaders;
  }

  /**
   * Splits the text content of a single node into section nodes.
   *
   * Sections whose text is empty after trimming (for example blank lines
   * before the first header, or an empty document) produce no node.
   *
   * @param node Node whose content (read with `MetadataMode.NONE`) is split.
   * @returns One node per non-empty section, in document order.
   */
  getNodesFromNode(node: BaseNode<Metadata>): BaseNode<Metadata>[] {
    const text = node.getContent(MetadataMode.NONE);
    // Built once per call instead of once per line.
    const headerPattern = /^(#+)\s(.*)/;
    const markdownNodes: BaseNode<Metadata>[] = [];
    let metadata: Metadata = {};
    let inCodeBlock = false;
    let currentSection = "";

    // Emits the buffered section, skipping whitespace-only buffers so that
    // no node with empty text is ever produced.
    const flushSection = (): void => {
      const sectionText = currentSection.trim();
      if (sectionText !== "") {
        markdownNodes.push(
          this.buildNodeFromSplit(sectionText, node, metadata),
        );
      }
    };

    for (const line of text.split("\n")) {
      // Fences may be indented (e.g. inside a list item), so leading
      // whitespace is ignored when looking for the backticks.
      if (line.trimStart().startsWith("```")) {
        inCodeBlock = !inCodeBlock;
      }

      // `#` lines inside a fenced block are code, not headers.
      const headerMatch = inCodeBlock ? null : headerPattern.exec(line);
      if (headerMatch) {
        flushSection();
        metadata = this.updateMetadata(
          metadata,
          headerMatch[2],
          headerMatch[1].length,
        );
        // The header title (without `#` markers) opens the new section.
        currentSection = `${headerMatch[2]}\n`;
      } else {
        currentSection += `${line}\n`;
      }
    }

    flushSection();
    return markdownNodes;
  }

  /**
   * Splits every given node and concatenates the results.
   *
   * @param documents Nodes to split.
   * @returns Section nodes of all inputs, preserving input order.
   */
  getNodesFromDocuments(documents: BaseNode<Metadata>[]): BaseNode<Metadata>[] {
    return documents.flatMap((document) => this.getNodesFromNode(document));
  }
}
header + +Also # not a header + + # Still not a header + `, + }), + ]); + expect(splits.length).toBe(1); + }); + + test("test_pre_header_content", () => { + const markdownParser = new MarkdownNodeParser(); + + const splits = markdownParser.getNodesFromDocuments([ + new Document({ + text: ` + +pre-header content + +# Header 1 + +Content + +## Sub-header + `, + }), + ]); + expect(splits.length).toBe(3); + }); + + test("test_header_metadata", () => { + const markdownParser = new MarkdownNodeParser(); + + const splits = markdownParser.getNodesFromDocuments([ + new Document({ + text: `# Main Header + +Content + +## Sub-header + +Content + +### Sub-sub header + +Content + +# New title + `, + }), + ]); + expect(splits.length).toBe(4); + expect(splits[0].metadata).toEqual({ "Header 1": "Main Header" }); + expect(splits[1].metadata).toEqual({ + "Header 1": "Main Header", + "Header 2": "Sub-header", + }); + expect(splits[2].metadata).toEqual({ + "Header 1": "Main Header", + "Header 2": "Sub-header", + "Header 3": "Sub-sub header", + }); + expect(splits[3].metadata).toEqual({ "Header 1": "New title" }); + }); +});