diff --git a/examples/markdown.ts b/examples/markdown.ts new file mode 100644 index 0000000000000000000000000000000000000000..cc71c69541b2b2b19e3c47c3f9487af0eb278137 --- /dev/null +++ b/examples/markdown.ts @@ -0,0 +1,20 @@ +import { MarkdownReader, VectorStoreIndex } from "llamaindex"; + +async function main() { + // Load the markdown file + const reader = new MarkdownReader(); + const documents = await reader.loadData("node_modules/llamaindex/README.md"); + + // Split text and create embeddings. Store them in a VectorStoreIndex + const index = await VectorStoreIndex.fromDocuments(documents); + + // Query the index + const queryEngine = index.asQueryEngine(); + + const response = await queryEngine.query("What does the example code do?"); + + // Output response + console.log(response.toString()); +} + +main().catch(console.error); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index c1949db79408b23a202a6c25cae849bf9c7c0c2a..f650f8fbb8098dfe9636b1137546361b67f2711d 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -23,6 +23,7 @@ export * from "./callbacks/CallbackManager"; export * from "./readers/base"; export * from "./readers/PDFReader"; export * from "./readers/CSVReader"; +export * from "./readers/MarkdownReader"; export * from "./readers/SimpleDirectoryReader"; export * from "./storage"; diff --git a/packages/core/src/readers/MarkdownReader.ts b/packages/core/src/readers/MarkdownReader.ts new file mode 100644 index 0000000000000000000000000000000000000000..63adfd7cda6c68776c5983c80aa945a93a331f1c --- /dev/null +++ b/packages/core/src/readers/MarkdownReader.ts @@ -0,0 +1,110 @@ +import { Document } from "../Node"; +import { DEFAULT_FS, GenericFileSystem } from "../storage"; +import { BaseReader } from "./base"; + +type MarkdownTuple = [string | null, string]; + +/** + * Extract text from markdown files. + * Produces one Document per section: a header together with the text that follows it. 
/**
 * Reader that loads a markdown file as Documents, one per header section.
 *
 * The file is split at header lines (`#`, `##`, ...). Each header together
 * with the text beneath it becomes one Document; text that appears before the
 * first header becomes a Document of its own. Images and inline hyperlinks
 * can optionally be stripped before the file is split.
 */
export class MarkdownReader implements BaseReader {
  private _removeHyperlinks: boolean;
  private _removeImages: boolean;

  /**
   * @param removeHyperlinks - When true (default), inline links are replaced
   *   by their link text.
   * @param removeImages - When true (default), image embeds are removed.
   */
  constructor(removeHyperlinks: boolean = true, removeImages: boolean = true) {
    this._removeHyperlinks = removeHyperlinks;
    this._removeImages = removeImages;
  }

  /**
   * Split markdown text into `[header, text]` tuples.
   *
   * A header is any line that starts with one or more `#` followed by
   * whitespace. Note that such lines inside fenced code blocks are treated as
   * headers too, because fences are not tracked.
   *
   * @param markdownText - The markdown text to split.
   * @returns One tuple per section. The header has its `#` characters removed
   *   and is trimmed (`null` for text that is not under any header); the text
   *   has anything that looks like an HTML tag (`<...>`) removed.
   */
  markdownToTups(markdownText: string): MarkdownTuple[] {
    const sections: MarkdownTuple[] = [];
    let currentHeader: string | null = null;
    let currentText = "";

    // Accept both LF and CRLF line endings so no stray "\r" ends up in the text.
    for (const line of markdownText.split(/\r?\n/)) {
      if (!/^#+\s/.test(line)) {
        currentText += line + "\n";
        continue;
      }

      if (currentHeader !== null) {
        if (!currentText) {
          // Two headers with nothing at all between them: fold the new one
          // into the current header instead of emitting an empty section.
          currentHeader += line + "\n";
          continue;
        }
        sections.push([currentHeader, currentText]);
      } else if (currentText.trim()) {
        // Text that precedes the first header would otherwise be lost.
        sections.push([null, currentText]);
      }

      currentHeader = line;
      currentText = "";
    }
    // Flush whatever is left after the last header (or the whole file when
    // it contains no header at all).
    sections.push([currentHeader, currentText]);

    // `map` returns a new array, so its result must be the return value.
    // `||` is deliberate: a header that is empty after stripping becomes null.
    return sections.map(
      ([header, text]): MarkdownTuple => [
        header?.replace(/#/g, "").trim() || null,
        text.replace(/<.*?>/g, ""),
      ],
    );
  }

  /**
   * Remove image embeds from markdown text.
   *
   * Handles both wiki-style embeds (`![[file.png]]`) and standard markdown
   * images (`![alt](src)`). Matching is non-greedy so that two embeds on the
   * same line do not swallow the text between them.
   *
   * @param content - The markdown text.
   * @returns The text with image embeds removed.
   */
  removeImages(content: string): string {
    const pattern = /!\[\[.*?\]\]|!\[[^\]]*\]\([^)]*\)/g;
    return content.replace(pattern, "");
  }

  /**
   * Replace inline links (`[text](url)`) with their link text.
   *
   * A `[` that directly follows `!` is skipped, so image embeds are left
   * untouched when image removal is turned off.
   *
   * @param content - The markdown text.
   * @returns The text with inline links replaced by their text.
   */
  removeHyperlinks(content: string): string {
    const pattern = /(?<!!)\[(.*?)\]\((.*?)\)/g;
    return content.replace(pattern, "$1");
  }

  /**
   * Apply the configured clean-ups and split the result into sections.
   *
   * @param content - Raw markdown text.
   * @returns The `[header, text]` tuples produced by `markdownToTups`.
   */
  parseTups(content: string): MarkdownTuple[] {
    let modifiedContent = content;
    // Images go first: `![alt](src)` contains a link-shaped `[alt](src)`, and
    // stripping links first would leave a stray `!alt` behind.
    if (this._removeImages) {
      modifiedContent = this.removeImages(modifiedContent);
    }
    if (this._removeHyperlinks) {
      modifiedContent = this.removeHyperlinks(modifiedContent);
    }
    return this.markdownToTups(modifiedContent);
  }

  /**
   * Read a markdown file and convert each section into a Document.
   *
   * @param file - Path of the markdown file to read.
   * @param fs - File system used to read the file; defaults to `DEFAULT_FS`.
   * @returns One Document per section. Sections with a header have the text
   *   `\n\n<header>\n<text>`; sections without one contain just the text.
   */
  async loadData(
    file: string,
    fs: GenericFileSystem = DEFAULT_FS,
  ): Promise<Document[]> {
    const content = await fs.readFile(file, { encoding: "utf-8" });
    return this.parseTups(content).map(
      ([header, value]) =>
        new Document({
          text: header ? `\n\n${header}\n${value}` : value,
        }),
    );
  }
}