From 1bcf76049d3be4c2ae502e3e35685f56aed8ed84 Mon Sep 17 00:00:00 2001
From: Sourabh Desai <sourabhdesai@gmail.com>
Date: Tue, 27 Jun 2023 05:44:22 +0000
Subject: [PATCH] start of basic pdf & directory readers

---
 packages/core/package.json                    |  2 +
 packages/core/src/readers/PDFReader.ts        | 26 ++++++
 .../core/src/readers/SimpleDirectoryReader.ts | 32 +++++++
 packages/core/src/readers/base.ts             |  8 ++
 packages/core/src/storage/FileSystem.ts       | 87 +++++++++++++++----
 .../core/src/storage/kvStore/SimpleKVStore.ts |  4 +-
 .../storage/vectorStore/SimpleVectorStore.ts  |  6 +-
 .../core/src/tests/InMemoryFileSystem.test.ts |  7 +-
 pnpm-lock.yaml                                | 24 ++++++
 9 files changed, 172 insertions(+), 24 deletions(-)
 create mode 100644 packages/core/src/readers/PDFReader.ts
 create mode 100644 packages/core/src/readers/SimpleDirectoryReader.ts
 create mode 100644 packages/core/src/readers/base.ts

diff --git a/packages/core/package.json b/packages/core/package.json
index 481a1d8e1..b303d0360 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -5,6 +5,7 @@
     "js-tiktoken": "^1.0.7",
     "lodash": "^4.17.21",
     "openai": "^3.3.0",
+    "pdf-parse": "^1.1.1",
     "tiktoken-node": "^0.0.6",
     "uuid": "^9.0.0",
     "wink-nlp": "^1.14.1"
@@ -18,6 +19,7 @@
   "devDependencies": {
     "@types/lodash": "^4.14.195",
     "@types/node": "^18",
+    "@types/pdf-parse": "^1.1.1",
     "@types/uuid": "^9.0.2",
     "node-stdlib-browser": "^1.2.0"
   }
diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts
new file mode 100644
index 000000000..d2fb778c6
--- /dev/null
+++ b/packages/core/src/readers/PDFReader.ts
@@ -0,0 +1,26 @@
+import { Document } from "../Document";
+import { BaseReader } from "./base";
+import { GenericFileSystem } from "../storage/FileSystem";
+import { DEFAULT_FS } from "../storage/constants";
+import { default as pdfParse } from "pdf-parse";
+import _ from "lodash";
+
+/**
+ * Reads a single PDF file and extracts its text with pdf-parse.
+ */
+export class PDFReader implements BaseReader {
+  /**
+   * @param file Path of the PDF file to load.
+   * @param fs Filesystem used to read the file.
+   * @returns A one-element array holding the document built from the extracted text.
+   */
+  async loadData(
+    file: string,
+    fs: GenericFileSystem = DEFAULT_FS
+  ): Promise<Document[]> {
+    // readFile is typed as returning a string, but pdf-parse is handed the raw result.
+    const dataBuffer = (await fs.readFile(file)) as any;
+    const data = await pdfParse(dataBuffer);
+    return [new Document(data.text, file)];
+  }
+}
diff --git a/packages/core/src/readers/SimpleDirectoryReader.ts b/packages/core/src/readers/SimpleDirectoryReader.ts
new file mode 100644
index 000000000..0a467469e
--- /dev/null
+++ b/packages/core/src/readers/SimpleDirectoryReader.ts
@@ -0,0 +1,32 @@
+import { Document } from "../Document";
+import { BaseReader } from "./base";
+import { CompleteFileSystem, walk } from "../storage/FileSystem";
+import { DEFAULT_FS } from "../storage/constants";
+
+/**
+ * Loads every file found under a directory (recursively) as a Document.
+ */
+export default class SimpleDirectoryReader implements BaseReader {
+  /**
+   * @param directoryPath Root directory to traverse.
+   * @param fs Filesystem that supports both reading and directory traversal.
+   * @returns One document per readable file; unreadable files are logged and skipped.
+   */
+  async loadData(
+    directoryPath: string,
+    fs: CompleteFileSystem = DEFAULT_FS as CompleteFileSystem
+  ): Promise<Document[]> {
+    const docs: Document[] = [];
+    for await (const filePath of walk(fs, directoryPath)) {
+      try {
+        // Request text explicitly: Node's readFile yields a Buffer without an encoding.
+        const fileData = await fs.readFile(filePath, "utf-8");
+        // Use the file's own path (as PDFReader does), not the shared root directory.
+        docs.push(new Document(fileData, filePath));
+      } catch (e) {
+        console.error(`Error reading file ${filePath}: ${e}`);
+      }
+    }
+    return docs;
+  }
+}
diff --git a/packages/core/src/readers/base.ts b/packages/core/src/readers/base.ts
new file mode 100644
index 000000000..87fc1d4f2
--- /dev/null
+++ b/packages/core/src/readers/base.ts
@@ -0,0 +1,8 @@
+import { Document } from "../Document";
+
+/**
+ * Common contract for readers: load some source into a list of Documents.
+ */
+export interface BaseReader {
+  loadData(...args: any[]): Promise<Document[]>;
+}
diff --git a/packages/core/src/storage/FileSystem.ts b/packages/core/src/storage/FileSystem.ts
index 228233c53..9ac5e15ec 100644
--- a/packages/core/src/storage/FileSystem.ts
+++ b/packages/core/src/storage/FileSystem.ts
@@ -9,10 +9,18 @@ import _ from "lodash";
 export interface GenericFileSystem {
   writeFile(path: string, content: string, options?: any): Promise<void>;
   readFile(path: string, options?: any): Promise<string>;
-  exists(path: string): Promise<boolean>;
+  access(path: string): Promise<void>;
   mkdir(path: string, options?: any): Promise<void>;
 }
 
+/**
+ * Directory-traversal operations required by `walk`.
+ */
+export interface WalkableFileSystem {
+  readdir(path: string): Promise<string[]>;
+  stat(path: string): Promise<any>;
+}
+
 /**
  * A filesystem implementation that stores files in memory.
  */
@@ -30,7 +38,10 @@ export class InMemoryFileSystem implements GenericFileSystem {
     return _.cloneDeep(this.files[path]);
   }
 
-  async exists(path: string): Promise<boolean> {
-    return path in this.files;
+  async access(path: string): Promise<void> {
+    // Mirror fs/promises.access: resolve when the path exists, reject otherwise.
+    if (!(path in this.files)) {
+      throw new Error(`File ${path} does not exist`);
+    }
   }
 
@@ -39,19 +50,13 @@ export class InMemoryFileSystem implements GenericFileSystem {
   }
 }
 
-export function getNodeFS(): GenericFileSystem {
+/** Filesystem supporting both basic file operations and directory traversal. */
+export type CompleteFileSystem = GenericFileSystem & WalkableFileSystem;
+
+/** Returns Node's promise-based `fs` module (`fs/promises`). */
+export function getNodeFS(): CompleteFileSystem {
   const fs = require("fs/promises");
-  return {
-    exists: async (path: string) => {
-      try {
-        await fs.access(path);
-        return true;
-      } catch {
-        return false;
-      }
-    },
-    ...fs,
-  };
+  return fs;
 }
 
 let fs = null;
@@ -60,4 +65,54 @@ try {
 } catch (e) {
   fs = new InMemoryFileSystem();
 }
-export const DEFAULT_FS = fs as GenericFileSystem;
+export const DEFAULT_FS: GenericFileSystem | CompleteFileSystem =
+  fs as GenericFileSystem;
+
+// FS utility functions
+
+/**
+ * Checks if a file exists.
+ * Analogous to the os.path.exists function from Python.
+ * @param fs The filesystem to use.
+ * @param path The path to the file to check.
+ * @returns A promise that resolves to true if the file exists, false otherwise.
+ */
+export async function exists(
+  fs: GenericFileSystem,
+  path: string
+): Promise<boolean> {
+  try {
+    await fs.access(path);
+    return true;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Recursively yields the path of every non-directory entry below a directory.
+ * @param fs The filesystem to traverse.
+ * @param dirPath The directory to start from.
+ * @throws If `fs` is an InMemoryFileSystem, which cannot be traversed.
+ */
+export async function* walk(
+  fs: WalkableFileSystem,
+  dirPath: string
+): AsyncIterable<string> {
+  if (fs instanceof InMemoryFileSystem) {
+    throw new Error(
+      "The InMemoryFileSystem does not support directory traversal."
+    );
+  }
+
+  const entries = await fs.readdir(dirPath);
+  for (const entry of entries) {
+    const fullPath = `${dirPath}/${entry}`;
+    const stats = await fs.stat(fullPath);
+    if (stats.isDirectory()) {
+      yield* walk(fs, fullPath);
+    } else {
+      yield fullPath;
+    }
+  }
+}
diff --git a/packages/core/src/storage/kvStore/SimpleKVStore.ts b/packages/core/src/storage/kvStore/SimpleKVStore.ts
index 4fb82a42e..196d7631a 100644
--- a/packages/core/src/storage/kvStore/SimpleKVStore.ts
+++ b/packages/core/src/storage/kvStore/SimpleKVStore.ts
@@ -1,5 +1,5 @@
 import * as path from "path";
-import { GenericFileSystem } from "../FileSystem";
+import { GenericFileSystem, exists } from "../FileSystem";
 import { DEFAULT_COLLECTION, DEFAULT_FS } from "../constants";
 import * as _ from "lodash";
 import { BaseKVStore } from "./types";
@@ -60,7 +60,7 @@ export class SimpleKVStore extends BaseKVStore {
     fs = fs || DEFAULT_FS;
     // TODO: decide on a way to polyfill path
     let dirPath = path.dirname(persistPath);
-    if (!(await fs.exists(dirPath))) {
+    if (!(await exists(fs, dirPath))) {
       await fs.mkdir(dirPath);
     }
     await fs.writeFile(persistPath, JSON.stringify(this.data));
diff --git a/packages/core/src/storage/vectorStore/SimpleVectorStore.ts b/packages/core/src/storage/vectorStore/SimpleVectorStore.ts
index c0ca46edd..489b13022 100644
--- a/packages/core/src/storage/vectorStore/SimpleVectorStore.ts
+++ b/packages/core/src/storage/vectorStore/SimpleVectorStore.ts
@@ -1,5 +1,5 @@
 import _ from "lodash";
-import { GenericFileSystem } from "../FileSystem";
+import { GenericFileSystem, exists } from "../FileSystem";
 import {
   NodeWithEmbedding,
   VectorStore,
@@ -134,7 +134,7 @@ export class SimpleVectorStore implements VectorStore {
     fs?: GenericFileSystem
   ): Promise<void> {
     fs = fs || this.fs;
-    if (!(await fs.exists(persistPath))) {
+    if (!(await exists(fs, persistPath))) {
       await fs.mkdir(persistPath);
     }
 
@@ -146,7 +146,7 @@ export class SimpleVectorStore implements VectorStore {
     fs?: GenericFileSystem
   ): Promise<SimpleVectorStore> {
     fs = fs || DEFAULT_FS;
-    if (!(await fs.exists(persistPath))) {
+    if (!(await exists(fs, persistPath))) {
       throw new Error(
         `No existing SimpleVectorStore found at ${persistPath}, skipping load.`
       );
diff --git a/packages/core/src/tests/InMemoryFileSystem.test.ts b/packages/core/src/tests/InMemoryFileSystem.test.ts
index f8e59434d..d86d6c18e 100644
--- a/packages/core/src/tests/InMemoryFileSystem.test.ts
+++ b/packages/core/src/tests/InMemoryFileSystem.test.ts
@@ -2,6 +2,7 @@ import {
   GenericFileSystem,
   getNodeFS,
   InMemoryFileSystem,
+  exists,
 } from "../storage/FileSystem";
 import os from "os";
 import path from "path";
@@ -83,18 +84,18 @@ describe.each<FileSystemUnderTest>([
   describe("exists", () => {
     it("returns true for existing file", async () => {
       await testFS.writeFile(`${tempDir}/test.txt`, "Hello, world!");
-      expect(await testFS.exists(`${tempDir}/test.txt`)).toBe(true);
+      expect(await exists(testFS, `${tempDir}/test.txt`)).toBe(true);
     });
 
     it("returns false for non-existing file", async () => {
-      expect(await testFS.exists(`${tempDir}/not_exist.txt`)).toBe(false);
+      expect(await exists(testFS, `${tempDir}/not_exist.txt`)).toBe(false);
     });
   });
 
   describe("mkdir", () => {
     it("creates directory if it doesn't exist", async () => {
       await testFS.mkdir(`${tempDir}/testDir`);
-      expect(await testFS.exists(`${tempDir}/testDir`)).toBe(true);
+      expect(await exists(testFS, `${tempDir}/testDir`)).toBe(true);
     });
   });
 });
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 28440f02d..a708c644b 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -131,6 +131,9 @@ importers:
       openai:
         specifier: ^3.3.0
         version: 3.3.0
+      pdf-parse:
+        specifier: ^1.1.1
+        version: 1.1.1
       tiktoken-node:
         specifier: ^0.0.6
         version: 0.0.6
@@ -147,6 +150,9 @@ importers:
       '@types/node':
         specifier: ^18
         version: 18.6.0
+      '@types/pdf-parse':
+        specifier: ^1.1.1
+        version: 1.1.1
       '@types/uuid':
         specifier: ^9.0.2
         version: 9.0.2
@@ -1121,6 +1127,10 @@ packages:
     resolution: {integrity: sha512-EhcH/wvidPy1WeML3TtYFGR83UzjxeWRen9V402T8aUGYsCHOmfoisV3ZSg03gAFIbLq8TnWOJ0f4cALtnSEUg==}
     dev: true
 
+  /@types/pdf-parse@1.1.1:
+    resolution: {integrity: sha512-lDBKAslCwvfK2uvS1Uk+UCpGvw+JRy5vnBFANPKFSY92n/iEnunXi0KVBjPJXhsM4jtdcPnS7tuZ0zjA9x6piQ==}
+    dev: true
+
   /@types/prettier@2.7.3:
     resolution: {integrity: sha512-+68kP9yzs4LMp7VNh8gdzMSPZFL44MLGqiHWvttYJe+6qnuVr4Ek9wSBQoveqY/r+LwjCcU29kNVkidwim+kYA==}
     dev: true
@@ -4229,6 +4239,10 @@ packages:
       lower-case: 1.1.4
     dev: true
 
+  /node-ensure@0.0.0:
+    resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==}
+    dev: false
+
   /node-int64@0.4.0:
     resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==}
     dev: true
@@ -4567,6 +4581,16 @@ packages:
       sha.js: 2.4.11
     dev: true
 
+  /pdf-parse@1.1.1:
+    resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==}
+    engines: {node: '>=6.8.1'}
+    dependencies:
+      debug: 3.2.7
+      node-ensure: 0.0.0
+    transitivePeerDependencies:
+      - supports-color
+    dev: false
+
   /picocolors@1.0.0:
     resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==}
 
-- 
GitLab