From a7edc4d225fc7772bd18c66bd32e85ee63bb60d0 Mon Sep 17 00:00:00 2001
From: Sourabh Desai <sourabhdesai@gmail.com>
Date: Tue, 27 Jun 2023 06:51:56 +0000
Subject: [PATCH] update method signature

---
 packages/core/src/readers/PDFReader.ts        |  6 +--
 .../core/src/readers/SimpleDirectoryReader.ts | 49 +++++++++++++++++--
 2 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts
index d2fb778c6..91b9f1379 100644
--- a/packages/core/src/readers/PDFReader.ts
+++ b/packages/core/src/readers/PDFReader.ts
@@ -5,13 +5,13 @@ import { DEFAULT_FS } from "../storage/constants";
 import { default as pdfParse } from "pdf-parse";
 import _ from "lodash";
 
-export class PDFReader implements BaseReader {
+export default class PDFReader implements BaseReader {
   async loadData(
     file: string,
     fs: GenericFileSystem = DEFAULT_FS
-  ): Promise<Document> {
+  ): Promise<Document[]> {
     let dataBuffer = (await fs.readFile(file)) as any;
     const data = await pdfParse(dataBuffer);
-    return new Document(data.text, file);
+    return [new Document(data.text, file)];
   }
 }
diff --git a/packages/core/src/readers/SimpleDirectoryReader.ts b/packages/core/src/readers/SimpleDirectoryReader.ts
index 0a467469e..d4b850489 100644
--- a/packages/core/src/readers/SimpleDirectoryReader.ts
+++ b/packages/core/src/readers/SimpleDirectoryReader.ts
@@ -1,18 +1,57 @@
+import _ from "lodash";
 import { Document } from "../Document";
 import { BaseReader } from "./base";
 import { CompleteFileSystem, walk } from "../storage/FileSystem";
 import { DEFAULT_FS } from "../storage/constants";
+import PDFReader from "./PDFReader";
 
-export default class SimpleDirectoryReader implements BaseReader {
+export class TextFileReader implements BaseReader {
   async loadData(
-    directoryPath: string,
+    file: string,
     fs: CompleteFileSystem = DEFAULT_FS as CompleteFileSystem
   ): Promise<Document[]> {
-    const docs: Document[] = [];
+    const dataBuffer = await fs.readFile(file, "utf-8");
+    return [new Document(dataBuffer, file)];
+  }
+}
+
+const FILE_EXT_TO_READER: { [key: string]: BaseReader } = {
+  txt: new TextFileReader(),
+  pdf: new PDFReader(),
+};
+
+export type SimpleDirectoryReaderLoadDataProps = {
+  directoryPath: string;
+  fs?: CompleteFileSystem;
+  defaultReader?: BaseReader | null;
+  fileExtToReader?: { [key: string]: BaseReader };
+};
+
+export default class SimpleDirectoryReader implements BaseReader {
+  async loadData({
+    directoryPath,
+    fs = DEFAULT_FS as CompleteFileSystem,
+    defaultReader = new TextFileReader(),
+    fileExtToReader = FILE_EXT_TO_READER,
+  }: SimpleDirectoryReaderLoadDataProps): Promise<Document[]> {
+    let docs: Document[] = [];
     for await (const filePath of walk(fs, directoryPath)) {
       try {
-        const fileData = await fs.readFile(filePath);
-        docs.push(new Document(fileData, directoryPath));
+        const fileExt = _.last(filePath.split(".")) || "";
+
+        let reader = null;
+
+        if (fileExt in fileExtToReader) {
+          reader = fileExtToReader[fileExt];
+        } else if (!_.isNil(defaultReader)) {
+          reader = defaultReader;
+        } else {
+          console.warn(`No reader for file extension of ${filePath}`);
+          continue;
+        }
+
+        const fileDocs = await reader.loadData(filePath, fs);
+        docs.push(...fileDocs);
       } catch (e) {
         console.error(`Error reading file ${filePath}: ${e}`);
       }
-- 
GitLab