Skip to content
Snippets Groups Projects
Unverified Commit 72f62718 authored by yisding's avatar yisding Committed by GitHub
Browse files

Merge pull request #160 from mtutty/add-observable-reader

Add observer/callback feature to SimpleDirectoryReader
parents fe9056f0 e938a4d1
Branches
Tags
No related merge requests found
......@@ -3,6 +3,7 @@
# dependencies
node_modules
.pnp
.pnpm-store
.pnp.js
# testing
......
import { SimpleDirectoryReader } from "llamaindex";
function callback(
category: string,
name: string,
status: any,
message?: string,
): boolean {
console.log(category, name, status, message);
if (name.endsWith(".pdf")) {
console.log("I DON'T WANT PDF FILES!");
return false;
}
return true;
}
async function main() {
// Load page
const reader = new SimpleDirectoryReader(callback);
const params = { directoryPath: "./data" };
await reader.loadData(params);
}
main().catch(console.error);
import _ from "lodash";
import { Document } from "../Node";
import { DEFAULT_FS } from "../storage/constants";
import { CompleteFileSystem, walk } from "../storage/FileSystem";
import { BaseReader } from "./base";
import { DEFAULT_FS } from "../storage/constants";
import { PapaCSVReader } from "./CSVReader";
import { DocxReader } from "./DocxReader";
import { HTMLReader } from "./HTMLReader";
import { MarkdownReader } from "./MarkdownReader";
import { PDFReader } from "./PDFReader";
import { BaseReader } from "./base";
type ReaderCallback = (
category: "file" | "directory",
name: string,
status: ReaderStatus,
message?: string,
) => boolean;
enum ReaderStatus {
STARTED = 0,
COMPLETE,
ERROR,
}
/**
* Read a .txt file
......@@ -22,7 +34,7 @@ export class TextFileReader implements BaseReader {
}
}
const FILE_EXT_TO_READER: Record<string, BaseReader> = {
export const FILE_EXT_TO_READER: Record<string, BaseReader> = {
txt: new TextFileReader(),
pdf: new PDFReader(),
csv: new PapaCSVReader(),
......@@ -40,20 +52,37 @@ export type SimpleDirectoryReaderLoadDataProps = {
};
/**
* Read all of the documents in a directory. Currently supports PDF and TXT files.
* Read all of the documents in a directory.
* By default, supports the list of file types
* in the FILE_EXIT_TO_READER map.
*/
export class SimpleDirectoryReader implements BaseReader {
constructor(private observer?: ReaderCallback) {}
async loadData({
directoryPath,
fs = DEFAULT_FS as CompleteFileSystem,
defaultReader = new TextFileReader(),
fileExtToReader = FILE_EXT_TO_READER,
}: SimpleDirectoryReaderLoadDataProps): Promise<Document[]> {
// Observer can decide to skip the directory
if (
!this.doObserverCheck("directory", directoryPath, ReaderStatus.STARTED)
) {
return [];
}
let docs: Document[] = [];
for await (const filePath of walk(fs, directoryPath)) {
try {
const fileExt = _.last(filePath.split(".")) || "";
// Observer can decide to skip each file
if (!this.doObserverCheck("file", filePath, ReaderStatus.STARTED)) {
// Skip this file
continue;
}
let reader = null;
if (fileExt in fileExtToReader) {
......@@ -61,16 +90,52 @@ export class SimpleDirectoryReader implements BaseReader {
} else if (!_.isNil(defaultReader)) {
reader = defaultReader;
} else {
console.warn(`No reader for file extension of ${filePath}`);
const msg = `No reader for file extension of ${filePath}`;
console.warn(msg);
// In an error condition, observer's false cancels the whole process.
if (
!this.doObserverCheck("file", filePath, ReaderStatus.ERROR, msg)
) {
return [];
}
continue;
}
const fileDocs = await reader.loadData(filePath, fs);
docs.push(...fileDocs);
// Observer can still cancel addition of the resulting docs from this file
if (this.doObserverCheck("file", filePath, ReaderStatus.COMPLETE)) {
docs.push(...fileDocs);
}
} catch (e) {
console.error(`Error reading file ${filePath}: ${e}`);
const msg = `Error reading file ${filePath}: ${e}`;
console.error(msg);
// In an error condition, observer's false cancels the whole process.
if (!this.doObserverCheck("file", filePath, ReaderStatus.ERROR, msg)) {
return [];
}
}
}
// After successful import of all files, directory completion
// is only a notification for observer, cannot be cancelled.
this.doObserverCheck("directory", directoryPath, ReaderStatus.COMPLETE);
return docs;
}
private doObserverCheck(
category: "file" | "directory",
name: string,
status: ReaderStatus,
message?: string,
): boolean {
if (this.observer) {
return this.observer(category, name, status, message);
}
return true;
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment