Skip to content
Snippets Groups Projects
Unverified Commit fc0fdb5e authored by yisding's avatar yisding Committed by GitHub
Browse files

Merge pull request #82 from swk777/larry/md

add markdown reader
parents 293b83c3 9d6b2ed9
Branches
Tags
No related merge requests found
---
"llamaindex": patch
---
Added Markdown Reader (huge shoutout to @swk777)
import { MarkdownReader, VectorStoreIndex } from "llamaindex";
/**
 * Example: index a Markdown file and ask a question about it.
 *
 * Loads the package README with MarkdownReader, builds a VectorStoreIndex
 * from the resulting documents, runs a single query against it and prints
 * the response.
 */
async function main() {
  // Parse the Markdown file into documents
  const markdownDocs = await new MarkdownReader().loadData(
    "node_modules/llamaindex/README.md",
  );

  // Build the vector index from the loaded documents
  const vectorIndex = await VectorStoreIndex.fromDocuments(markdownDocs);

  // Ask a question through the index's query engine
  const engine = vectorIndex.asQueryEngine();
  const answer = await engine.query("What does the example code do?");

  // Print the answer
  console.log(answer.toString());
}

// Run the example and report any failure on stderr
main().catch(console.error);
import { MarkdownReader, VectorStoreIndex } from "llamaindex";
/**
 * Example: index a Markdown file and ask a question about it.
 *
 * Loads the package README with MarkdownReader, builds a VectorStoreIndex
 * from the resulting documents, runs a single query against it and prints
 * the response.
 */
async function main() {
  // Parse the Markdown file into documents
  const markdownDocs = await new MarkdownReader().loadData(
    "node_modules/llamaindex/README.md",
  );

  // Build the vector index from the loaded documents
  const vectorIndex = await VectorStoreIndex.fromDocuments(markdownDocs);

  // Ask a question through the index's query engine
  const engine = vectorIndex.asQueryEngine();
  const answer = await engine.query("What does the example code do?");

  // Print the answer
  console.log(answer.toString());
}

// Run the example and report any failure on stderr
main().catch(console.error);
...@@ -23,6 +23,7 @@ export * from "./callbacks/CallbackManager"; ...@@ -23,6 +23,7 @@ export * from "./callbacks/CallbackManager";
export * from "./readers/base"; export * from "./readers/base";
export * from "./readers/PDFReader"; export * from "./readers/PDFReader";
export * from "./readers/CSVReader"; export * from "./readers/CSVReader";
export * from "./readers/MarkdownReader";
export * from "./readers/SimpleDirectoryReader"; export * from "./readers/SimpleDirectoryReader";
export * from "./storage"; export * from "./storage";
import { Document } from "../Node";
import { DEFAULT_FS, GenericFileSystem } from "../storage";
import { BaseReader } from "./base";
/** A markdown section: `[header, body]`. The header is `null` for text that sits under no header. */
type MarkdownTuple = [string | null, string];

/**
 * Extract text from markdown files.
 * The file is split at header lines (lines starting with one or more `#`
 * followed by whitespace) and one Document is produced per section.
 */
export class MarkdownReader implements BaseReader {
  private readonly _removeHyperlinks: boolean;
  private readonly _removeImages: boolean;

  /**
   * @param {boolean} [removeHyperlinks=true] - Indicates whether hyperlinks should be removed.
   * @param {boolean} [removeImages=true] - Indicates whether images should be removed.
   */
  constructor(removeHyperlinks: boolean = true, removeImages: boolean = true) {
    this._removeHyperlinks = removeHyperlinks;
    this._removeImages = removeImages;
  }

  /**
   * Split markdown text into (header, text) tuples.
   * Header markers (`#`) are stripped from the headers and HTML tags are
   * stripped from the text. Consecutive header lines with no text between
   * them are merged into a single header. Text that precedes the first
   * header is returned as a tuple with a `null` header.
   * @param {string} markdownText - The markdown text to convert.
   * @returns {Array<MarkdownTuple>} - An array of tuples, where each tuple contains a header (or null) and its corresponding text.
   */
  markdownToTups(markdownText: string): MarkdownTuple[] {
    const markdownTups: MarkdownTuple[] = [];
    const lines = markdownText.split("\n");
    let currentHeader: string | null = null;
    let currentText = "";

    for (const line of lines) {
      if (/^#+\s/.test(line)) {
        if (currentHeader) {
          if (!currentText) {
            // Header directly followed by another header: merge them.
            currentHeader += line + "\n";
            continue;
          }
          markdownTups.push([currentHeader, currentText]);
        } else if (currentText.trim()) {
          // Keep the preamble before the first header instead of dropping it.
          markdownTups.push([null, currentText]);
        }
        currentHeader = line;
        currentText = "";
      } else {
        currentText += line + "\n";
      }
    }
    markdownTups.push([currentHeader, currentText]);

    // map() returns a new array; its result has to be returned, otherwise
    // the header/tag clean-up is silently lost.
    return markdownTups.map(
      ([header, text]): MarkdownTuple => [
        header === null ? null : header.replace(/#/g, "").trim() || null,
        text.replace(/<.*?>/g, ""),
      ],
    );
  }

  /**
   * Remove image references from markdown content.
   * Handles both wiki-style embeds (`![[file]]`) and standard markdown
   * images (`![alt](url)`).
   * @param {string} content - The markdown content.
   * @returns {string} - The content without image references.
   */
  removeImages(content: string): string {
    return (
      content
        // Lazy match so two embeds on one line do not swallow the text between them.
        .replace(/!\[\[.*?\]\]/g, "")
        .replace(/!\[.*?\]\(.*?\)/g, "")
    );
  }

  /**
   * Replace markdown hyperlinks (`[text](url)`) with their link text.
   * Image references (`![alt](url)`) are left untouched; they are the
   * responsibility of removeImages.
   * @param {string} content - The markdown content.
   * @returns {string} - The content with hyperlinks replaced by their text.
   */
  removeHyperlinks(content: string): string {
    const pattern = /(?<!!)\[(.*?)\]\((.*?)\)/g;
    return content.replace(pattern, "$1");
  }

  /**
   * Apply the configured clean-ups to the content and split it into tuples.
   * @param {string} content - The raw markdown content.
   * @returns {Array<MarkdownTuple>} - The (header, text) tuples of the cleaned content.
   */
  parseTups(content: string): MarkdownTuple[] {
    let modifiedContent = content;
    // Images go first: a linked image such as [![badge](img)](url) would
    // otherwise be mangled by the hyperlink replacement.
    if (this._removeImages) {
      modifiedContent = this.removeImages(modifiedContent);
    }
    if (this._removeHyperlinks) {
      modifiedContent = this.removeHyperlinks(modifiedContent);
    }
    return this.markdownToTups(modifiedContent);
  }

  /**
   * Read a markdown file and turn each section into a Document.
   * @param {string} file - Path of the markdown file, read as UTF-8 through `fs`.
   * @param {GenericFileSystem} [fs=DEFAULT_FS] - File system used to read the file.
   * @returns {Promise<Document[]>} - One Document per (header, text) tuple.
   */
  async loadData(
    file: string,
    fs: GenericFileSystem = DEFAULT_FS,
  ): Promise<Document[]> {
    const content = await fs.readFile(file, { encoding: "utf-8" });
    const tups = this.parseTups(content);
    const results: Document[] = [];
    for (const [header, value] of tups) {
      if (header) {
        results.push(
          new Document({
            text: `\n\n${header}\n${value}`,
          }),
        );
      } else {
        results.push(new Document({ text: value }));
      }
    }
    return results;
  }
}
...@@ -5,6 +5,7 @@ import { CompleteFileSystem, walk } from "../storage/FileSystem"; ...@@ -5,6 +5,7 @@ import { CompleteFileSystem, walk } from "../storage/FileSystem";
import { DEFAULT_FS } from "../storage/constants"; import { DEFAULT_FS } from "../storage/constants";
import { PDFReader } from "./PDFReader"; import { PDFReader } from "./PDFReader";
import { PapaCSVReader } from "./CSVReader"; import { PapaCSVReader } from "./CSVReader";
import { MarkdownReader } from "./MarkdownReader";
/** /**
* Read a .txt file * Read a .txt file
...@@ -23,6 +24,7 @@ const FILE_EXT_TO_READER: Record<string, BaseReader> = { ...@@ -23,6 +24,7 @@ const FILE_EXT_TO_READER: Record<string, BaseReader> = {
txt: new TextFileReader(), txt: new TextFileReader(),
pdf: new PDFReader(), pdf: new PDFReader(),
csv: new PapaCSVReader(), csv: new PapaCSVReader(),
md: new MarkdownReader(),
}; };
export type SimpleDirectoryReaderLoadDataProps = { export type SimpleDirectoryReaderLoadDataProps = {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment