diff --git a/.gitignore b/.gitignore
index 431de977e5f09b71b53361c817df8d78422fdf88..5e8442ca83d1e14009ac473a4a4aaa04f01abade 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 # dependencies
 node_modules
 .pnp
+.pnpm-store
 .pnp.js
 
 # testing
diff --git a/apps/simple/directory.ts b/apps/simple/directory.ts
new file mode 100644
index 0000000000000000000000000000000000000000..cf71bc1c4893c98f34b6f38830051a693143e90d
--- /dev/null
+++ b/apps/simple/directory.ts
@@ -0,0 +1,23 @@
+import { SimpleDirectoryReader } from "llamaindex";
+
+/**
+ * Example observer: logs every event and returns false for any name ending in
+ * ".pdf", which makes the reader skip that file when the event is a file start.
+ */
+function callback(category: string, name: string, status: unknown, message?: string): boolean {
+  console.log(category, name, status, message);
+  if (name.endsWith('.pdf')) {
+    console.log("I DON'T WANT PDF FILES!");
+    return false;
+  }
+  return true;
+}
+
+async function main() {
+  // Load the documents under ./data, reporting progress to the callback above.
+  const reader = new SimpleDirectoryReader(callback);
+  const params = { directoryPath: "./data"};
+  await reader.loadData(params);
+}
+
+main().catch(console.error);
diff --git a/packages/core/src/readers/SimpleDirectoryReader.ts b/packages/core/src/readers/SimpleDirectoryReader.ts
index 82bb2ba2c7ee8bd0241beac63399e08dc6ac7259..1abb1e125e8f7b323b0f810a2ab3e6da0bdccc36 100644
--- a/packages/core/src/readers/SimpleDirectoryReader.ts
+++ b/packages/core/src/readers/SimpleDirectoryReader.ts
@@ -9,6 +9,21 @@ import { HTMLReader } from "./HTMLReader";
 import { MarkdownReader } from "./MarkdownReader";
 import { PDFReader } from "./PDFReader";
 
+/**
+ * Observer invoked by SimpleDirectoryReader for the directory and for each file.
+ * Its boolean result decides whether the reader proceeds (true) or skips/cancels (false).
+ */
+export type ReaderCallback = (category: string, name: string, status: ReaderStatus, message?: string) => boolean;
+
+/**
+ * Stage of processing reported to a ReaderCallback.
+ */
+export enum ReaderStatus {
+  Started = 0,
+  Completed,
+  Error
+}
+
 /**
  * Read a .txt file
  */
@@ -22,7 +37,7 @@ export class TextFileReader implements BaseReader {
   }
 }
 
-const FILE_EXT_TO_READER: Record<string, BaseReader> = {
+export const FILE_EXT_TO_READER: Record<string, BaseReader> = {
   txt: new TextFileReader(),
   pdf: new PDFReader(),
   csv: new PapaCSVReader(),
@@ -40,20 +55,45 @@ export type SimpleDirectoryReaderLoadDataProps = {
 };
 
 /**
- * Read all of the documents in a directory. Currently supports PDF and TXT files.
+ * Read all of the documents in a directory.
+ * By default, supports the list of file types
+ * in the FILE_EXT_TO_READER map.
+ *
+ * An optional observer passed to the constructor is notified as the directory
+ * and each file are processed. When it returns false: a file Started event
+ * skips that file, a file Completed event discards that file's documents, and
+ * a directory Started or file Error event rejects loadData with 'Cancelled'.
  */
 export class SimpleDirectoryReader implements BaseReader {
+  constructor(private readonly observer?: ReaderCallback) {}
+
   async loadData({
     directoryPath,
     fs = DEFAULT_FS as CompleteFileSystem,
     defaultReader = new TextFileReader(),
     fileExtToReader = FILE_EXT_TO_READER,
   }: SimpleDirectoryReaderLoadDataProps): Promise<Document[]> {
+
+    // Observer can veto the whole directory; loadData then rejects with 'Cancelled'.
+    if (this.doObserverCheck(
+      'Directory', directoryPath, ReaderStatus.Started
+    ) === false) {
+      return this.getCancelled();
+    }
+
     let docs: Document[] = [];
     for await (const filePath of walk(fs, directoryPath)) {
       try {
         const fileExt = _.last(filePath.split(".")) || "";
 
+        // Observer can decide to skip each file
+        if (this.doObserverCheck(
+          'File', filePath, ReaderStatus.Started
+        ) === false) {
+          // Skip this file
+          continue;
+        }
+
         let reader = null;
 
         if (fileExt in fileExtToReader) {
@@ -61,16 +101,65 @@ export class SimpleDirectoryReader implements BaseReader {
         } else if (!_.isNil(defaultReader)) {
           reader = defaultReader;
         } else {
-          console.warn(`No reader for file extension of ${filePath}`);
+          const msg = `No reader for file extension of ${filePath}`;
+          console.warn(msg);
+
+          // In an error condition, observer's false cancels the whole process.
+          // The rejection is returned un-awaited, so the catch below does not intercept it.
+          if (this.doObserverCheck(
+            'File', filePath, ReaderStatus.Error, msg
+          ) === false) {
+            return this.getCancelled();
+          }
+
           continue;
         }
 
         const fileDocs = await reader.loadData(filePath, fs);
-        docs.push(...fileDocs);
+
+        // Observer can still cancel addition of the resulting docs from this file
+        if (this.doObserverCheck(
+          'File', filePath, ReaderStatus.Completed
+        )) {
+          docs.push(...fileDocs);
+        }
       } catch (e) {
-        console.error(`Error reading file ${filePath}: ${e}`);
+        const msg = `Error reading file ${filePath}: ${e}`;
+        console.error(msg);
+
+        // In an error condition, observer's false cancels the whole process.
+        if (this.doObserverCheck(
+          'File', filePath, ReaderStatus.Error, msg
+        ) === false) {
+          return this.getCancelled();
+        }
       }
     }
+
+    // After successful import of all files, directory completion
+    // is only a notification for observer, cannot be cancelled.
+    this.doObserverCheck(
+      'Directory', directoryPath, ReaderStatus.Completed
+    );
+
     return docs;
   }
+
+  /**
+   * Build the rejection used whenever the observer aborts the load; its reason is the string 'Cancelled'.
+   */
+  private getCancelled(): Promise<never> {
+    return Promise.reject('Cancelled');
+  }
+
+  /**
+   * Forward an event to the observer, if one was supplied.
+   * @returns the observer's result, or true when there is no observer.
+   */
+  private doObserverCheck(category: string, name: string, status: ReaderStatus, message?: string): boolean {
+    if (this.observer) {
+      return this.observer(category, name, status, message);
+    }
+    return true;
+  }
 }