Skip to content
Snippets Groups Projects
Commit 6fc6a499 authored by swk777's avatar swk777 Committed by Yi Ding
Browse files

add markdown reader

parent 293b83c3
No related branches found
No related tags found
No related merge requests found
import { MarkdownReader, VectorStoreIndex } from "llamaindex";
/**
 * Example: load a markdown file with MarkdownReader, index the resulting
 * documents in a VectorStoreIndex, run one query against it and print the
 * response.
 */
async function main() {
// Load the markdown file (the README shipped inside the installed llamaindex package)
const reader = new MarkdownReader();
const documents = await reader.loadData("node_modules/llamaindex/README.md");
// Split text and create embeddings. Store them in a VectorStoreIndex
const index = await VectorStoreIndex.fromDocuments(documents);
// Query the index
const queryEngine = index.asQueryEngine();
const response = await queryEngine.query("What does the example code do?");
// Output response
console.log(response.toString());
}
// Run the example; any rejection from main() is passed to console.error.
main().catch(console.error);
...@@ -23,6 +23,7 @@ export * from "./callbacks/CallbackManager"; ...@@ -23,6 +23,7 @@ export * from "./callbacks/CallbackManager";
export * from "./readers/base"; export * from "./readers/base";
export * from "./readers/PDFReader"; export * from "./readers/PDFReader";
export * from "./readers/CSVReader"; export * from "./readers/CSVReader";
export * from "./readers/MarkdownReader";
export * from "./readers/SimpleDirectoryReader"; export * from "./readers/SimpleDirectoryReader";
export * from "./storage"; export * from "./storage";
import { Document } from "../Node";
import { DEFAULT_FS, GenericFileSystem } from "../storage";
import { BaseReader } from "./base";
/**
 * One markdown section: `[header, text]`.
 * `header` is `null` for text that is not preceded by any header.
 */
type MarkdownTuple = [string | null, string];

/**
 * Extracts text from markdown files.
 *
 * The file is split into sections at header lines (`#`, `##`, ...). Each
 * section becomes one `[header, text]` tuple and, in `loadData`, one Document.
 */
export class MarkdownReader implements BaseReader {
  private readonly _removeHyperlinks: boolean;
  private readonly _removeImages: boolean;

  /**
   * @param removeHyperlinks - Whether `[text](url)` links are replaced by their text. Defaults to `true`.
   * @param removeImages - Whether image markup is removed. Defaults to `true`.
   */
  constructor(removeHyperlinks: boolean = true, removeImages: boolean = true) {
    this._removeHyperlinks = removeHyperlinks;
    this._removeImages = removeImages;
  }

  /**
   * Split markdown text into `[header, text]` tuples.
   *
   * - A line matching `/^#+\s/` starts a new section.
   * - Consecutive header lines with no text in between are merged into a
   *   single header.
   * - Text that appears before the first header is kept as a tuple with a
   *   `null` header (it is skipped only when it is whitespace-only).
   * - `#` characters are stripped from headers and HTML tags (`<...>`) are
   *   stripped from the text.
   *
   * @param markdownText - The markdown text to convert.
   * @returns The sections in document order. Always contains at least one tuple.
   */
  markdownToTups(markdownText: string): MarkdownTuple[] {
    const rawTups: MarkdownTuple[] = [];
    const lines = markdownText.split("\n");

    let currentHeader: string | null = null;
    let currentText = "";

    for (const line of lines) {
      if (!/^#+\s/.test(line)) {
        currentText += line + "\n";
        continue;
      }

      if (currentHeader !== null) {
        if (!currentText) {
          // Header directly followed by another header: merge them.
          currentHeader += line + "\n";
          continue;
        }
        rawTups.push([currentHeader, currentText]);
      } else if (currentText.trim()) {
        // Preamble before the first header; previously this text was lost.
        rawTups.push([null, currentText]);
      }

      currentHeader = line;
      currentText = "";
    }
    rawTups.push([currentHeader, currentText]);

    // Array.prototype.map returns a new array, so its result must be
    // returned; the original code discarded it and no cleanup was applied.
    return rawTups.map(
      ([header, text]): MarkdownTuple => [
        header === null ? null : header.replace(/#/g, "").trim() || null,
        text.replace(/<.*?>/g, ""),
      ],
    );
  }

  /**
   * Remove image markup from markdown content.
   *
   * Handles wiki-style embeds (`![[file.png]]`) and standard markdown images
   * (`![alt](url)`). Matching is non-greedy so several images on one line do
   * not swallow the text between them.
   *
   * @param content - Markdown text.
   * @returns The text with image markup removed.
   */
  removeImages(content: string): string {
    const pattern = /!\[\[.*?\]\]|!\[[^\]]*\]\([^)]*\)/g;
    return content.replace(pattern, "");
  }

  /**
   * Replace markdown hyperlinks (`[text](url)`) with their link text.
   *
   * A `[` that is directly preceded by `!` is left alone so image markup is
   * not turned into `!alt` when image removal is disabled.
   *
   * @param content - Markdown text.
   * @returns The text with hyperlinks replaced by their text.
   */
  removeHyperlinks(content: string): string {
    const pattern = /(?<!!)\[(.*?)\]\((.*?)\)/g;
    return content.replace(pattern, "$1");
  }

  /**
   * Apply the configured clean-ups and split the content into tuples.
   *
   * Images are removed before hyperlinks so that a linked image such as
   * `[![badge](img)](url)` is removed entirely instead of leaving fragments.
   *
   * @param content - Raw markdown text.
   * @returns The `[header, text]` tuples of the cleaned content.
   */
  parseTups(content: string): MarkdownTuple[] {
    let modifiedContent = content;
    if (this._removeImages) {
      modifiedContent = this.removeImages(modifiedContent);
    }
    if (this._removeHyperlinks) {
      modifiedContent = this.removeHyperlinks(modifiedContent);
    }
    return this.markdownToTups(modifiedContent);
  }

  /**
   * Read a markdown file and return one Document per section.
   *
   * @param file - Path of the markdown file to read.
   * @param fs - File system used to read the file. Defaults to `DEFAULT_FS`.
   * @returns One Document per `[header, text]` tuple; sections that have a
   * header get the header prepended to their text.
   */
  async loadData(
    file: string,
    fs: GenericFileSystem = DEFAULT_FS,
  ): Promise<Document[]> {
    const content = await fs.readFile(file, { encoding: "utf-8" });
    return this.parseTups(content).map(
      ([header, value]) =>
        new Document({
          text: header ? `\n\n${header}\n${value}` : value,
        }),
    );
  }
}
...@@ -5,6 +5,7 @@ import { CompleteFileSystem, walk } from "../storage/FileSystem"; ...@@ -5,6 +5,7 @@ import { CompleteFileSystem, walk } from "../storage/FileSystem";
import { DEFAULT_FS } from "../storage/constants"; import { DEFAULT_FS } from "../storage/constants";
import { PDFReader } from "./PDFReader"; import { PDFReader } from "./PDFReader";
import { PapaCSVReader } from "./CSVReader"; import { PapaCSVReader } from "./CSVReader";
import { MarkdownReader } from "./MarkdownReader";
/** /**
* Read a .txt file * Read a .txt file
...@@ -23,6 +24,7 @@ const FILE_EXT_TO_READER: Record<string, BaseReader> = { ...@@ -23,6 +24,7 @@ const FILE_EXT_TO_READER: Record<string, BaseReader> = {
txt: new TextFileReader(), txt: new TextFileReader(),
pdf: new PDFReader(), pdf: new PDFReader(),
csv: new PapaCSVReader(), csv: new PapaCSVReader(),
md: new MarkdownReader(),
}; };
export type SimpleDirectoryReaderLoadDataProps = { export type SimpleDirectoryReaderLoadDataProps = {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment