diff --git a/packages/core/package.json b/packages/core/package.json index 481a1d8e11c03fb119bb6e4483c6af82b32bc995..b303d036048822c443c8fe92241dd8c3f6a79e97 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -5,6 +5,7 @@ "js-tiktoken": "^1.0.7", "lodash": "^4.17.21", "openai": "^3.3.0", + "pdf-parse": "^1.1.1", "tiktoken-node": "^0.0.6", "uuid": "^9.0.0", "wink-nlp": "^1.14.1" @@ -18,6 +19,7 @@ "devDependencies": { "@types/lodash": "^4.14.195", "@types/node": "^18", + "@types/pdf-parse": "^1.1.1", "@types/uuid": "^9.0.2", "node-stdlib-browser": "^1.2.0" } diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts new file mode 100644 index 0000000000000000000000000000000000000000..d2fb778c6e7f5cce92560078f48fd35f8bc46c89 --- /dev/null +++ b/packages/core/src/readers/PDFReader.ts @@ -0,0 +1,17 @@ +import { Document } from "../Document"; +import { BaseReader } from "./base"; +import { GenericFileSystem } from "../storage/FileSystem"; +import { DEFAULT_FS } from "../storage/constants"; +import { default as pdfParse } from "pdf-parse"; +import _ from "lodash"; + +export class PDFReader implements BaseReader { + async loadData( + file: string, + fs: GenericFileSystem = DEFAULT_FS + ): Promise<Document> { + let dataBuffer = (await fs.readFile(file)) as any; + const data = await pdfParse(dataBuffer); + return new Document(data.text, file); + } +} diff --git a/packages/core/src/readers/SimpleDirectoryReader.ts b/packages/core/src/readers/SimpleDirectoryReader.ts new file mode 100644 index 0000000000000000000000000000000000000000..0a467469e65af5409e05cb18f287d4fabeadcc46 --- /dev/null +++ b/packages/core/src/readers/SimpleDirectoryReader.ts @@ -0,0 +1,22 @@ +import { Document } from "../Document"; +import { BaseReader } from "./base"; +import { CompleteFileSystem, walk } from "../storage/FileSystem"; +import { DEFAULT_FS } from "../storage/constants"; + +export default class SimpleDirectoryReader implements BaseReader { + async loadData( + directoryPath: string, + fs: CompleteFileSystem = DEFAULT_FS as CompleteFileSystem + ): Promise<Document[]> { + const docs: Document[] = []; + for await (const filePath of walk(fs, directoryPath)) { + try { + const fileData = await fs.readFile(filePath); + docs.push(new Document(fileData, directoryPath)); + } catch (e) { + console.error(`Error reading file ${filePath}: ${e}`); + } + } + return docs; + } +} diff --git a/packages/core/src/readers/base.ts b/packages/core/src/readers/base.ts new file mode 100644 index 0000000000000000000000000000000000000000..87fc1d4f2f61187083b1671d2054ce70ac51a279 --- /dev/null +++ b/packages/core/src/readers/base.ts @@ -0,0 +1,5 @@ +import { Document } from "../Document"; + +export interface BaseReader { + loadData(...args: any[]): Promise<Document[]>; +} diff --git a/packages/core/src/storage/FileSystem.ts b/packages/core/src/storage/FileSystem.ts index 228233c53fbfca6e9bb724a88fd86d5e00b048b8..9ac5e15ec1146e4f56821702e32341e4556375ca 100644 --- a/packages/core/src/storage/FileSystem.ts +++ b/packages/core/src/storage/FileSystem.ts @@ -9,10 +9,15 @@ import _ from "lodash"; export interface GenericFileSystem { writeFile(path: string, content: string, options?: any): Promise<void>; readFile(path: string, options?: any): Promise<string>; - exists(path: string): Promise<boolean>; + access(path: string): Promise<boolean>; mkdir(path: string, options?: any): Promise<void>; } +export interface WalkableFileSystem { + readdir(path: string): Promise<string[]>; + stat(path: string): Promise<any>; +} + /** * A filesystem implementation that stores files in memory. */ @@ -30,7 +35,7 @@ export class InMemoryFileSystem implements GenericFileSystem { return _.cloneDeep(this.files[path]); } - async exists(path: string): Promise<boolean> { + async access(path: string): Promise<boolean> { return path in this.files; } @@ -39,19 +44,11 @@ export class InMemoryFileSystem implements GenericFileSystem { } } -export function getNodeFS(): GenericFileSystem { +export type CompleteFileSystem = GenericFileSystem & WalkableFileSystem; + +export function getNodeFS(): CompleteFileSystem { const fs = require("fs/promises"); - return { - exists: async (path: string) => { - try { - await fs.access(path); - return true; - } catch { - return false; - } - }, - ...fs, - }; + return fs; } let fs = null; @@ -60,4 +57,48 @@ try { } catch (e) { fs = new InMemoryFileSystem(); } -export const DEFAULT_FS = fs as GenericFileSystem; +export const DEFAULT_FS: GenericFileSystem | CompleteFileSystem = + fs as GenericFileSystem; + +// FS utility functions + +/** + * Checks if a file exists. + * Analogous to the os.path.exists function from Python. + * @param fs The filesystem to use. + * @param path The path to the file to check. + * @returns A promise that resolves to true if the file exists, false otherwise. + */ +export async function exists( + fs: GenericFileSystem, + path: string +): Promise<boolean> { + try { + await fs.access(path); + return true; + } catch { + return false; + } +} + +export async function* walk( + fs: WalkableFileSystem, + dirPath: string +): AsyncIterable<string> { + if (fs instanceof InMemoryFileSystem) { + throw new Error( + "The InMemoryFileSystem does not support directory traversal." + ); + } + + const entries = await fs.readdir(dirPath); + for (const entry of entries) { + const fullPath = `${dirPath}/${entry}`; + const stats = await fs.stat(fullPath); + if (stats.isDirectory()) { + yield* walk(fs, fullPath); + } else { + yield fullPath; + } + } +} diff --git a/packages/core/src/storage/kvStore/SimpleKVStore.ts b/packages/core/src/storage/kvStore/SimpleKVStore.ts index 4fb82a42ed8d56a1b48f345c3c1cf9eb8af09de4..196d7631a9d9d0fd7d9264d12cb6e1a9bd8f6b8a 100644 --- a/packages/core/src/storage/kvStore/SimpleKVStore.ts +++ b/packages/core/src/storage/kvStore/SimpleKVStore.ts @@ -1,5 +1,5 @@ import * as path from "path"; -import { GenericFileSystem } from "../FileSystem"; +import { GenericFileSystem, exists } from "../FileSystem"; import { DEFAULT_COLLECTION, DEFAULT_FS } from "../constants"; import * as _ from "lodash"; import { BaseKVStore } from "./types"; @@ -60,7 +60,7 @@ export class SimpleKVStore extends BaseKVStore { fs = fs || DEFAULT_FS; // TODO: decide on a way to polyfill path let dirPath = path.dirname(persistPath); - if (!(await fs.exists(dirPath))) { + if (!(await exists(fs, dirPath))) { await fs.mkdir(dirPath); } await fs.writeFile(persistPath, JSON.stringify(this.data)); diff --git a/packages/core/src/storage/vectorStore/SimpleVectorStore.ts b/packages/core/src/storage/vectorStore/SimpleVectorStore.ts index c0ca46eddc93a8ddd00bae78054809f028db4869..489b13022a07943d9eb9b39172d33e0cbef6656b 100644 --- a/packages/core/src/storage/vectorStore/SimpleVectorStore.ts +++ b/packages/core/src/storage/vectorStore/SimpleVectorStore.ts @@ -1,5 +1,5 @@ import _ from "lodash"; -import { GenericFileSystem } from "../FileSystem"; +import { GenericFileSystem, exists } from "../FileSystem"; import { NodeWithEmbedding, VectorStore, @@ -134,7 +134,7 @@ export class SimpleVectorStore implements VectorStore { fs?: GenericFileSystem ): Promise<void> { fs = fs || this.fs; - if (!(await fs.exists(persistPath))) { + if (!(await exists(fs, persistPath))) { await fs.mkdir(persistPath); } @@ -146,7 +146,7 @@ export class SimpleVectorStore implements VectorStore { fs?: GenericFileSystem ): Promise<SimpleVectorStore> { fs = fs || DEFAULT_FS; - if (!(await fs.exists(persistPath))) { + if (!(await exists(fs, persistPath))) { throw new Error( `No existing SimpleVectorStore found at ${persistPath}, skipping load.` ); diff --git a/packages/core/src/tests/InMemoryFileSystem.test.ts b/packages/core/src/tests/InMemoryFileSystem.test.ts index f8e59434dce368eebc59586990ca79131db7cb1a..d86d6c18efff30309aa76a0c862d3afd9941f330 100644 --- a/packages/core/src/tests/InMemoryFileSystem.test.ts +++ b/packages/core/src/tests/InMemoryFileSystem.test.ts @@ -2,6 +2,7 @@ import { GenericFileSystem, getNodeFS, InMemoryFileSystem, + exists, } from "../storage/FileSystem"; import os from "os"; import path from "path"; @@ -83,18 +84,18 @@ describe.each<FileSystemUnderTest>([ describe("exists", () => { it("returns true for existing file", async () => { await testFS.writeFile(`${tempDir}/test.txt`, "Hello, world!"); - expect(await testFS.exists(`${tempDir}/test.txt`)).toBe(true); + expect(await exists(testFS, `${tempDir}/test.txt`)).toBe(true); }); it("returns false for non-existing file", async () => { - expect(await testFS.exists(`${tempDir}/not_exist.txt`)).toBe(false); + expect(await exists(testFS, `${tempDir}/not_exist.txt`)).toBe(false); }); }); describe("mkdir", () => { it("creates directory if it doesn't exist", async () => { await testFS.mkdir(`${tempDir}/testDir`); - expect(await testFS.exists(`${tempDir}/testDir`)).toBe(true); + expect(await exists(testFS, `${tempDir}/testDir`)).toBe(true); }); }); }); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 28440f02def18f6f8fec23635414a920ad7199b6..a708c644bd89effee5267ca67ad070ecf6bfb6e0 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -131,6 +131,9 @@ importers: openai: specifier: ^3.3.0 version: 3.3.0 + pdf-parse: + specifier: ^1.1.1 + version: 1.1.1 tiktoken-node: specifier: ^0.0.6 version: 0.0.6 @@ -147,6 +150,9 @@ importers: '@types/node': specifier: ^18 version: 18.6.0 + '@types/pdf-parse': + specifier: ^1.1.1 + version: 1.1.1 '@types/uuid': specifier: ^9.0.2 version: 9.0.2 @@ -1121,6 +1127,10 @@ packages: resolution: {integrity: sha512-EhcH/wvidPy1WeML3TtYFGR83UzjxeWRen9V402T8aUGYsCHOmfoisV3ZSg03gAFIbLq8TnWOJ0f4cALtnSEUg==} dev: true + /@types/pdf-parse@1.1.1: + resolution: {integrity: sha512-lDBKAslCwvfK2uvS1Uk+UCpGvw+JRy5vnBFANPKFSY92n/iEnunXi0KVBjPJXhsM4jtdcPnS7tuZ0zjA9x6piQ==} + dev: true + /@types/prettier@2.7.3: resolution: {integrity: sha512-+68kP9yzs4LMp7VNh8gdzMSPZFL44MLGqiHWvttYJe+6qnuVr4Ek9wSBQoveqY/r+LwjCcU29kNVkidwim+kYA==} dev: true @@ -4229,6 +4239,10 @@ packages: lower-case: 1.1.4 dev: true + /node-ensure@0.0.0: + resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==} + dev: false + /node-int64@0.4.0: resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==} dev: true @@ -4567,6 +4581,16 @@ packages: sha.js: 2.4.11 dev: true + /pdf-parse@1.1.1: + resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==} + engines: {node: '>=6.8.1'} + dependencies: + debug: 3.2.7 + node-ensure: 0.0.0 + transitivePeerDependencies: + - supports-color + dev: false + /picocolors@1.0.0: resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==}