Skip to content
Snippets Groups Projects
Commit 1bcf7604 authored by Sourabh Desai's avatar Sourabh Desai
Browse files

start of basic pdf & directory readers

parent 5f9f8137
No related branches found
No related tags found
No related merge requests found
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
"js-tiktoken": "^1.0.7", "js-tiktoken": "^1.0.7",
"lodash": "^4.17.21", "lodash": "^4.17.21",
"openai": "^3.3.0", "openai": "^3.3.0",
"pdf-parse": "^1.1.1",
"tiktoken-node": "^0.0.6", "tiktoken-node": "^0.0.6",
"uuid": "^9.0.0", "uuid": "^9.0.0",
"wink-nlp": "^1.14.1" "wink-nlp": "^1.14.1"
...@@ -18,6 +19,7 @@ ...@@ -18,6 +19,7 @@
"devDependencies": { "devDependencies": {
"@types/lodash": "^4.14.195", "@types/lodash": "^4.14.195",
"@types/node": "^18", "@types/node": "^18",
"@types/pdf-parse": "^1.1.1",
"@types/uuid": "^9.0.2", "@types/uuid": "^9.0.2",
"node-stdlib-browser": "^1.2.0" "node-stdlib-browser": "^1.2.0"
} }
......
import { Document } from "../Document";
import { BaseReader } from "./base";
import { GenericFileSystem } from "../storage/FileSystem";
import { DEFAULT_FS } from "../storage/constants";
import { default as pdfParse } from "pdf-parse";
import _ from "lodash";
/**
 * Reader that extracts the text content of a single PDF file via `pdf-parse`.
 */
export class PDFReader implements BaseReader {
  /**
   * Reads one PDF file and wraps its extracted text in a Document.
   *
   * @param file Path of the PDF file to read.
   * @param fs Filesystem used to read the file; defaults to DEFAULT_FS.
   * @returns A single-element array holding the parsed document. An array is
   *   returned (rather than a bare Document) so the method satisfies the
   *   `BaseReader.loadData` contract of `Promise<Document[]>`.
   */
  async loadData(
    file: string,
    fs: GenericFileSystem = DEFAULT_FS
  ): Promise<Document[]> {
    // GenericFileSystem declares readFile as returning a string, so the value
    // is cast before being handed to pdf-parse.
    // NOTE(review): pdf-parse presumably needs the raw bytes (a Buffer) here —
    // confirm that the filesystem in use does not decode the file to text.
    const dataBuffer = (await fs.readFile(file)) as any;
    const data = await pdfParse(dataBuffer);
    return [new Document(data.text, file)];
  }
}
import { Document } from "../Document";
import { BaseReader } from "./base";
import { CompleteFileSystem, walk } from "../storage/FileSystem";
import { DEFAULT_FS } from "../storage/constants";
/**
 * Reader that recursively walks a directory and loads every file it finds
 * as a Document.
 */
export default class SimpleDirectoryReader implements BaseReader {
  /**
   * Loads all files found under a directory (including nested directories).
   *
   * Files that fail to read are logged with console.error and skipped, so one
   * unreadable file does not abort the whole traversal.
   *
   * @param directoryPath Root directory to traverse.
   * @param fs Filesystem used for traversal and reading; defaults to DEFAULT_FS.
   * @returns One Document per successfully read file, in traversal order.
   */
  async loadData(
    directoryPath: string,
    fs: CompleteFileSystem = DEFAULT_FS as CompleteFileSystem
  ): Promise<Document[]> {
    const docs: Document[] = [];
    for await (const filePath of walk(fs, directoryPath)) {
      try {
        // NOTE(review): no encoding is passed; with a Node-backed filesystem
        // this may yield a Buffer rather than a string — confirm.
        const fileData = await fs.readFile(filePath);
        // Tag each document with its own file path (not the directory path),
        // so documents loaded from the same directory do not all share one
        // value. Presumably the second Document argument is an identifier —
        // this mirrors how PDFReader passes the file path.
        docs.push(new Document(fileData, filePath));
      } catch (e) {
        console.error(`Error reading file ${filePath}: ${e}`);
      }
    }
    return docs;
  }
}
import { Document } from "../Document";
/**
 * Common contract for data readers: something that loads content from a
 * source and turns it into Documents.
 */
export interface BaseReader {
  /**
   * Loads documents from a reader-specific source.
   *
   * @param args Reader-specific arguments (left untyped here; each
   *   implementation declares its own parameter list).
   * @returns A promise resolving to the loaded documents.
   */
  loadData(...args: any[]): Promise<Document[]>;
}
...@@ -9,10 +9,15 @@ import _ from "lodash"; ...@@ -9,10 +9,15 @@ import _ from "lodash";
export interface GenericFileSystem { export interface GenericFileSystem {
writeFile(path: string, content: string, options?: any): Promise<void>; writeFile(path: string, content: string, options?: any): Promise<void>;
readFile(path: string, options?: any): Promise<string>; readFile(path: string, options?: any): Promise<string>;
exists(path: string): Promise<boolean>; access(path: string): Promise<boolean>;
mkdir(path: string, options?: any): Promise<void>; mkdir(path: string, options?: any): Promise<void>;
} }
/**
 * Filesystem capabilities needed for directory traversal (see `walk`).
 */
export interface WalkableFileSystem {
  /**
   * Lists the entries of a directory. `walk` joins each returned entry onto
   * the directory path, so entries are treated as names relative to `path`.
   */
  readdir(path: string): Promise<string[]>;
  /**
   * Returns metadata for a path. Typed as `any`; `walk` only relies on the
   * result exposing an `isDirectory()` method.
   */
  stat(path: string): Promise<any>;
}
/** /**
* A filesystem implementation that stores files in memory. * A filesystem implementation that stores files in memory.
*/ */
...@@ -30,7 +35,7 @@ export class InMemoryFileSystem implements GenericFileSystem { ...@@ -30,7 +35,7 @@ export class InMemoryFileSystem implements GenericFileSystem {
return _.cloneDeep(this.files[path]); return _.cloneDeep(this.files[path]);
} }
async exists(path: string): Promise<boolean> { async access(path: string): Promise<boolean> {
return path in this.files; return path in this.files;
} }
...@@ -39,19 +44,11 @@ export class InMemoryFileSystem implements GenericFileSystem { ...@@ -39,19 +44,11 @@ export class InMemoryFileSystem implements GenericFileSystem {
} }
} }
export function getNodeFS(): GenericFileSystem { export type CompleteFileSystem = GenericFileSystem & WalkableFileSystem;
export function getNodeFS(): CompleteFileSystem {
const fs = require("fs/promises"); const fs = require("fs/promises");
return { return fs;
exists: async (path: string) => {
try {
await fs.access(path);
return true;
} catch {
return false;
}
},
...fs,
};
} }
let fs = null; let fs = null;
...@@ -60,4 +57,48 @@ try { ...@@ -60,4 +57,48 @@ try {
} catch (e) { } catch (e) {
fs = new InMemoryFileSystem(); fs = new InMemoryFileSystem();
} }
export const DEFAULT_FS = fs as GenericFileSystem; export const DEFAULT_FS: GenericFileSystem | CompleteFileSystem =
fs as GenericFileSystem;
// FS utility functions
/**
 * Checks if a file exists.
 * Analogous to the os.path.exists function from Python.
 *
 * Supports both styles of `access` implementation:
 * - implementations that reject/throw when the path is missing, and
 * - implementations that resolve to a boolean (as the `GenericFileSystem`
 *   signature declares), where a resolved `false` means "missing".
 *
 * @param fs The filesystem to use.
 * @param path The path to the file to check.
 * @returns A promise that resolves to true if the file exists, false otherwise.
 */
export async function exists(
  fs: GenericFileSystem,
  path: string
): Promise<boolean> {
  try {
    const result: unknown = await fs.access(path);
    // Only an explicit `false` signals absence; implementations that resolve
    // with no value on success are still treated as "exists".
    return result !== false;
  } catch {
    return false;
  }
}
/**
 * Recursively yields the path of every non-directory entry under a directory.
 *
 * Entries are visited in the order `readdir` returns them; a subdirectory is
 * fully traversed before the next sibling entry is visited.
 *
 * @param fs Filesystem providing `readdir` and `stat`.
 * @param dirPath Directory to start from.
 * @returns An async iterable of file paths, each built as `<dir>/<entry>`.
 * @throws Error if `fs` is an InMemoryFileSystem, which cannot be traversed.
 */
export async function* walk(
  fs: WalkableFileSystem,
  dirPath: string
): AsyncIterable<string> {
  const traversalUnsupported = fs instanceof InMemoryFileSystem;
  if (traversalUnsupported) {
    throw new Error(
      "The InMemoryFileSystem does not support directory traversal."
    );
  }

  const names = await fs.readdir(dirPath);
  for (const name of names) {
    const childPath = dirPath + "/" + name;
    const info = await fs.stat(childPath);

    // Plain entry: emit it and move on to the next sibling.
    if (!info.isDirectory()) {
      yield childPath;
      continue;
    }

    // Directory: descend and emit everything found beneath it.
    yield* walk(fs, childPath);
  }
}
import * as path from "path"; import * as path from "path";
import { GenericFileSystem } from "../FileSystem"; import { GenericFileSystem, exists } from "../FileSystem";
import { DEFAULT_COLLECTION, DEFAULT_FS } from "../constants"; import { DEFAULT_COLLECTION, DEFAULT_FS } from "../constants";
import * as _ from "lodash"; import * as _ from "lodash";
import { BaseKVStore } from "./types"; import { BaseKVStore } from "./types";
...@@ -60,7 +60,7 @@ export class SimpleKVStore extends BaseKVStore { ...@@ -60,7 +60,7 @@ export class SimpleKVStore extends BaseKVStore {
fs = fs || DEFAULT_FS; fs = fs || DEFAULT_FS;
// TODO: decide on a way to polyfill path // TODO: decide on a way to polyfill path
let dirPath = path.dirname(persistPath); let dirPath = path.dirname(persistPath);
if (!(await fs.exists(dirPath))) { if (!(await exists(fs, dirPath))) {
await fs.mkdir(dirPath); await fs.mkdir(dirPath);
} }
await fs.writeFile(persistPath, JSON.stringify(this.data)); await fs.writeFile(persistPath, JSON.stringify(this.data));
......
import _ from "lodash"; import _ from "lodash";
import { GenericFileSystem } from "../FileSystem"; import { GenericFileSystem, exists } from "../FileSystem";
import { import {
NodeWithEmbedding, NodeWithEmbedding,
VectorStore, VectorStore,
...@@ -134,7 +134,7 @@ export class SimpleVectorStore implements VectorStore { ...@@ -134,7 +134,7 @@ export class SimpleVectorStore implements VectorStore {
fs?: GenericFileSystem fs?: GenericFileSystem
): Promise<void> { ): Promise<void> {
fs = fs || this.fs; fs = fs || this.fs;
if (!(await fs.exists(persistPath))) { if (!(await exists(fs, persistPath))) {
await fs.mkdir(persistPath); await fs.mkdir(persistPath);
} }
...@@ -146,7 +146,7 @@ export class SimpleVectorStore implements VectorStore { ...@@ -146,7 +146,7 @@ export class SimpleVectorStore implements VectorStore {
fs?: GenericFileSystem fs?: GenericFileSystem
): Promise<SimpleVectorStore> { ): Promise<SimpleVectorStore> {
fs = fs || DEFAULT_FS; fs = fs || DEFAULT_FS;
if (!(await fs.exists(persistPath))) { if (!(await exists(fs, persistPath))) {
throw new Error( throw new Error(
`No existing SimpleVectorStore found at ${persistPath}, skipping load.` `No existing SimpleVectorStore found at ${persistPath}, skipping load.`
); );
......
...@@ -2,6 +2,7 @@ import { ...@@ -2,6 +2,7 @@ import {
GenericFileSystem, GenericFileSystem,
getNodeFS, getNodeFS,
InMemoryFileSystem, InMemoryFileSystem,
exists,
} from "../storage/FileSystem"; } from "../storage/FileSystem";
import os from "os"; import os from "os";
import path from "path"; import path from "path";
...@@ -83,18 +84,18 @@ describe.each<FileSystemUnderTest>([ ...@@ -83,18 +84,18 @@ describe.each<FileSystemUnderTest>([
describe("exists", () => { describe("exists", () => {
it("returns true for existing file", async () => { it("returns true for existing file", async () => {
await testFS.writeFile(`${tempDir}/test.txt`, "Hello, world!"); await testFS.writeFile(`${tempDir}/test.txt`, "Hello, world!");
expect(await testFS.exists(`${tempDir}/test.txt`)).toBe(true); expect(await exists(testFS, `${tempDir}/test.txt`)).toBe(true);
}); });
it("returns false for non-existing file", async () => { it("returns false for non-existing file", async () => {
expect(await testFS.exists(`${tempDir}/not_exist.txt`)).toBe(false); expect(await exists(testFS, `${tempDir}/not_exist.txt`)).toBe(false);
}); });
}); });
describe("mkdir", () => { describe("mkdir", () => {
it("creates directory if it doesn't exist", async () => { it("creates directory if it doesn't exist", async () => {
await testFS.mkdir(`${tempDir}/testDir`); await testFS.mkdir(`${tempDir}/testDir`);
expect(await testFS.exists(`${tempDir}/testDir`)).toBe(true); expect(await exists(testFS, `${tempDir}/testDir`)).toBe(true);
}); });
}); });
}); });
...@@ -131,6 +131,9 @@ importers: ...@@ -131,6 +131,9 @@ importers:
openai: openai:
specifier: ^3.3.0 specifier: ^3.3.0
version: 3.3.0 version: 3.3.0
pdf-parse:
specifier: ^1.1.1
version: 1.1.1
tiktoken-node: tiktoken-node:
specifier: ^0.0.6 specifier: ^0.0.6
version: 0.0.6 version: 0.0.6
...@@ -147,6 +150,9 @@ importers: ...@@ -147,6 +150,9 @@ importers:
'@types/node': '@types/node':
specifier: ^18 specifier: ^18
version: 18.6.0 version: 18.6.0
'@types/pdf-parse':
specifier: ^1.1.1
version: 1.1.1
'@types/uuid': '@types/uuid':
specifier: ^9.0.2 specifier: ^9.0.2
version: 9.0.2 version: 9.0.2
...@@ -1121,6 +1127,10 @@ packages: ...@@ -1121,6 +1127,10 @@ packages:
resolution: {integrity: sha512-EhcH/wvidPy1WeML3TtYFGR83UzjxeWRen9V402T8aUGYsCHOmfoisV3ZSg03gAFIbLq8TnWOJ0f4cALtnSEUg==} resolution: {integrity: sha512-EhcH/wvidPy1WeML3TtYFGR83UzjxeWRen9V402T8aUGYsCHOmfoisV3ZSg03gAFIbLq8TnWOJ0f4cALtnSEUg==}
dev: true dev: true
/@types/pdf-parse@1.1.1:
resolution: {integrity: sha512-lDBKAslCwvfK2uvS1Uk+UCpGvw+JRy5vnBFANPKFSY92n/iEnunXi0KVBjPJXhsM4jtdcPnS7tuZ0zjA9x6piQ==}
dev: true
/@types/prettier@2.7.3: /@types/prettier@2.7.3:
resolution: {integrity: sha512-+68kP9yzs4LMp7VNh8gdzMSPZFL44MLGqiHWvttYJe+6qnuVr4Ek9wSBQoveqY/r+LwjCcU29kNVkidwim+kYA==} resolution: {integrity: sha512-+68kP9yzs4LMp7VNh8gdzMSPZFL44MLGqiHWvttYJe+6qnuVr4Ek9wSBQoveqY/r+LwjCcU29kNVkidwim+kYA==}
dev: true dev: true
...@@ -4229,6 +4239,10 @@ packages: ...@@ -4229,6 +4239,10 @@ packages:
lower-case: 1.1.4 lower-case: 1.1.4
dev: true dev: true
/node-ensure@0.0.0:
resolution: {integrity: sha512-DRI60hzo2oKN1ma0ckc6nQWlHU69RH6xN0sjQTjMpChPfTYvKZdcQFfdYK2RWbJcKyUizSIy/l8OTGxMAM1QDw==}
dev: false
/node-int64@0.4.0: /node-int64@0.4.0:
resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==} resolution: {integrity: sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==}
dev: true dev: true
...@@ -4567,6 +4581,16 @@ packages: ...@@ -4567,6 +4581,16 @@ packages:
sha.js: 2.4.11 sha.js: 2.4.11
dev: true dev: true
/pdf-parse@1.1.1:
resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==}
engines: {node: '>=6.8.1'}
dependencies:
debug: 3.2.7
node-ensure: 0.0.0
transitivePeerDependencies:
- supports-color
dev: false
/picocolors@1.0.0: /picocolors@1.0.0:
resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==} resolution: {integrity: sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment