From 73819bf19d63d7a7169afc7a1628535bbdd10fd9 Mon Sep 17 00:00:00 2001
From: Marcus Schiesser <mail@marcusschiesser.de>
Date: Thu, 6 Jun 2024 11:51:54 +0200
Subject: [PATCH] feat: Unify metadata and ID handling of documents, allow files to be read by `Buffer`

---
 .changeset/twenty-crabs-join.md               |  5 ++++
 packages/core/src/readers/CSVReader.ts        | 11 ++++----
 packages/core/src/readers/DocxReader.ts       | 12 ++++-----
 packages/core/src/readers/HTMLReader.ts       | 11 ++++----
 packages/core/src/readers/ImageReader.ts      | 12 ++++-----
 packages/core/src/readers/LlamaParseReader.ts | 25 ++++++++-----------
 packages/core/src/readers/MarkdownReader.ts   | 15 ++++++-----
 packages/core/src/readers/PDFReader.ts        | 13 ++++------
 .../src/readers/SimpleDirectoryReader.edge.ts | 10 +-------
 packages/core/src/readers/TextFileReader.ts   | 11 ++++----
 packages/core/src/readers/type.ts             | 23 ++++++++++++++---
 11 files changed, 74 insertions(+), 74 deletions(-)
 create mode 100644 .changeset/twenty-crabs-join.md

diff --git a/.changeset/twenty-crabs-join.md b/.changeset/twenty-crabs-join.md
new file mode 100644
index 000000000..95afc4bee
--- /dev/null
+++ b/.changeset/twenty-crabs-join.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Unify metadata and ID handling of documents, allow files to be read by `Buffer`
diff --git a/packages/core/src/readers/CSVReader.ts b/packages/core/src/readers/CSVReader.ts
index 4844b5f1d..39f0abc4d 100644
--- a/packages/core/src/readers/CSVReader.ts
+++ b/packages/core/src/readers/CSVReader.ts
@@ -1,15 +1,14 @@
-import { fs } from "@llamaindex/env";
 import type { ParseConfig } from "papaparse";
 import Papa from "papaparse";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * papaparse-based csv parser
  * @class CSVReader
  * @implements BaseReader
  */
-export class PapaCSVReader implements FileReader {
+export class PapaCSVReader extends FileReader {
   private concatRows: boolean;
   private colJoiner: string;
   private rowJoiner: string;
@@ -27,6 +26,7 @@ export class PapaCSVReader implements FileReader {
     rowJoiner: string = "\n",
     papaConfig?: ParseConfig,
   ) {
+    super();
     this.concatRows = concatRows;
     this.colJoiner = colJoiner;
     this.rowJoiner = rowJoiner;
@@ -39,9 +39,8 @@ export class PapaCSVReader implements FileReader {
    * @param {GenericFileSystem} [fs=DEFAULT_FS] - The file system to use for reading the file.
    * @returns {Promise<Document[]>}
    */
-  async loadData(file: string): Promise<Document[]> {
-    const fileContent = await fs.readFile(file, "utf-8");
-    const result = Papa.parse(fileContent, this.papaConfig);
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const result = Papa.parse(fileContent.toString("utf-8"), this.papaConfig);
     const textList = result.data.map((row: any) => {
       // Compatible with header row mode
       const rowValues = Object.values(row).map((value) => String(value));
diff --git a/packages/core/src/readers/DocxReader.ts b/packages/core/src/readers/DocxReader.ts
index 0f79aa90a..1d4357b44 100644
--- a/packages/core/src/readers/DocxReader.ts
+++ b/packages/core/src/readers/DocxReader.ts
@@ -1,13 +1,11 @@
-import { fs } from "@llamaindex/env";
 import mammoth from "mammoth";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
-export class DocxReader implements FileReader {
+export class DocxReader extends FileReader {
   /** DocxParser */
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file);
-    const { value } = await mammoth.extractRawText({ buffer: dataBuffer });
-    return [new Document({ text: value, id_: file })];
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const { value } = await mammoth.extractRawText({ buffer: fileContent });
+    return [new Document({ text: value })];
   }
 }
diff --git a/packages/core/src/readers/HTMLReader.ts b/packages/core/src/readers/HTMLReader.ts
index 5b16c5556..6927444f5 100644
--- a/packages/core/src/readers/HTMLReader.ts
+++ b/packages/core/src/readers/HTMLReader.ts
@@ -1,6 +1,5 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Extract the significant text from an arbitrary HTML document.
@@ -9,18 +8,18 @@ import type { FileReader } from "./type.js";
  * All other tags are removed, and the inner text is kept intact.
  * Html entities (e.g., &) are not decoded.
  */
-export class HTMLReader implements FileReader {
+export class HTMLReader extends FileReader {
   /**
    * Public method for this reader.
    * Required by BaseReader interface.
    * @param file Path/name of the file to be loaded.
    * @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file.
    */
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file, "utf-8");
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const dataBuffer = fileContent.toString("utf-8");
     const htmlOptions = this.getOptions();
     const content = await this.parseContent(dataBuffer, htmlOptions);
-    return [new Document({ text: content, id_: file })];
+    return [new Document({ text: content })];
   }
 
   /**
diff --git a/packages/core/src/readers/ImageReader.ts b/packages/core/src/readers/ImageReader.ts
index 6a3d70b4c..0414e2199 100644
--- a/packages/core/src/readers/ImageReader.ts
+++ b/packages/core/src/readers/ImageReader.ts
@@ -1,12 +1,11 @@
-import { fs } from "@llamaindex/env";
 import type { Document } from "../Node.js";
 import { ImageDocument } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Reads the content of an image file into a Document object (which stores the image file as a Blob).
  */
-export class ImageReader implements FileReader {
+export class ImageReader extends FileReader {
   /**
    * Public method for this reader.
    * Required by BaseReader interface.
@@ -14,9 +13,8 @@ export class ImageReader implements FileReader {
    * @param fs fs wrapper interface for getting the file content.
    * @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file.
    */
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file);
-    const blob = new Blob([dataBuffer]);
-    return [new ImageDocument({ image: blob, id_: file })];
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const blob = new Blob([fileContent]);
+    return [new ImageDocument({ image: blob })];
   }
 }
diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts
index f292e4609..dc85b6457 100644
--- a/packages/core/src/readers/LlamaParseReader.ts
+++ b/packages/core/src/readers/LlamaParseReader.ts
@@ -1,7 +1,7 @@
 import { fs, getEnv } from "@llamaindex/env";
 import { filetypemime } from "magic-bytes.js";
 import { Document } from "../Node.js";
-import type { FileReader, Language, ResultType } from "./type.js";
+import { FileReader, type Language, type ResultType } from "./type.js";
 
 const SupportedFiles: { [key: string]: string } = {
   ".pdf": "application/pdf",
@@ -105,7 +105,7 @@ const SupportedFiles: { [key: string]: string } = {
  * Represents a reader for parsing files using the LlamaParse API.
  * See https://github.com/run-llama/llama_parse
  */
-export class LlamaParseReader implements FileReader {
+export class LlamaParseReader extends FileReader {
   // The API key for the LlamaParse API. Can be set as an environment variable: LLAMA_CLOUD_API_KEY
   apiKey: string;
   // The base URL of the Llama Parsing API.
@@ -133,6 +133,7 @@ export class LlamaParseReader implements FileReader {
   // numWorkers is implemented in SimpleDirectoryReader
 
   constructor(params: Partial<LlamaParseReader> = {}) {
+    super();
     Object.assign(this, params);
     params.apiKey = params.apiKey ?? getEnv("LLAMA_CLOUD_API_KEY");
     if (!params.apiKey) {
@@ -151,17 +152,16 @@ export class LlamaParseReader implements FileReader {
   }
 
   // Create a job for the LlamaParse API
-  private async createJob(file: string): Promise<string> {
+  private async createJob(data: Buffer): Promise<string> {
     // Load data, set the mime type
-    const data = await fs.readFile(file);
     const mimeType = await this.getMimeType(data);
 
     if (this.verbose) {
-      console.log(`Starting load for file: ${file}`);
+      console.log(`Starting load for file with mimeType: ${mimeType}`);
     }
 
     const body = new FormData();
-    body.set("file", new Blob([data], { type: mimeType }), file);
+    body.set("file", new Blob([data], { type: mimeType }));
     body.append("language", this.language);
     body.append("parsing_instruction", this.parsingInstruction);
     body.append("skip_diagonal_text", this.skipDiagonalText.toString());
@@ -251,15 +251,12 @@ export class LlamaParseReader implements FileReader {
    * Loads data from a file and returns an array of Document objects.
    * To be used with resultType = "text" and "markdown"
    *
-   * @param {string} file - The path to the file to be loaded.
+   * @param {Buffer} fileContent - The content of the file to be loaded.
   * @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
    */
-  async loadData(file: string): Promise<Document[]> {
-    // Set metadata to contain file_path
-    const metadata = { file_path: file };
-
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
     // Creates a job for the file
-    const jobId = await this.createJob(file);
+    const jobId = await this.createJob(fileContent);
     if (this.verbose) {
       console.log(`Started parsing the file under job id ${jobId}`);
     }
@@ -269,7 +266,6 @@ export class LlamaParseReader implements FileReader {
     return [
       new Document({
         text: resultJson[this.resultType],
-        metadata: metadata,
       }),
     ];
   }
@@ -281,8 +277,9 @@ export class LlamaParseReader implements FileReader {
    * @return {Promise<Record<string, any>>} A Promise that resolves to the JSON object.
    */
   async loadJson(file: string): Promise<Record<string, any>> {
+    const data = await fs.readFile(file);
     // Creates a job for the file
-    const jobId = await this.createJob(file);
+    const jobId = await this.createJob(data);
     if (this.verbose) {
       console.log(`Started parsing the file under job id ${jobId}`);
     }
diff --git a/packages/core/src/readers/MarkdownReader.ts b/packages/core/src/readers/MarkdownReader.ts
index 47a1ecc44..940fd4dda 100644
--- a/packages/core/src/readers/MarkdownReader.ts
+++ b/packages/core/src/readers/MarkdownReader.ts
@@ -1,6 +1,5 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 type MarkdownTuple = [string | null, string];
 
@@ -8,7 +7,7 @@ type MarkdownTuple = [string | null, string];
  * Extract text from markdown files.
  * Returns dictionary with keys as headers and values as the text between headers.
  */
-export class MarkdownReader implements FileReader {
+export class MarkdownReader extends FileReader {
   private _removeHyperlinks: boolean;
   private _removeImages: boolean;
 
@@ -17,6 +16,7 @@ export class MarkdownReader implements FileReader {
    * @param {boolean} [removeImages=true] - Indicates whether images should be removed.
    */
   constructor(removeHyperlinks: boolean = true, removeImages: boolean = true) {
+    super();
     this._removeHyperlinks = removeHyperlinks;
     this._removeImages = removeImages;
   }
@@ -89,18 +89,17 @@ export class MarkdownReader implements FileReader {
     return this.markdownToTups(modifiedContent);
   }
 
-  async loadData(file: string): Promise<Document[]> {
-    const content = await fs.readFile(file, "utf-8");
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const content = fileContent.toString("utf-8");
     const tups = this.parseTups(content);
     const results: Document[] = [];
     let counter = 0;
     for (const [header, value] of tups) {
-      const id_ = `${file}_${counter}`;
       if (header) {
         const text = `\n\n${header}\n${value}`;
-        results.push(new Document({ text, id_ }));
+        results.push(new Document({ text }));
       } else {
-        results.push(new Document({ text: value, id_ }));
+        results.push(new Document({ text: value }));
       }
       counter += 1;
     }
diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts
index bda8ab753..50ff9df7d 100644
--- a/packages/core/src/readers/PDFReader.ts
+++ b/packages/core/src/readers/PDFReader.ts
@@ -1,20 +1,17 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Read the text of a PDF
 */
-export class PDFReader implements FileReader {
-  async loadData(file: string): Promise<Document[]> {
-    const content = await fs.readFile(file);
-    const pages = await readPDF(content);
+export class PDFReader extends FileReader {
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const pages = await readPDF(fileContent);
     return pages.map((text, page) => {
-      const id_ = `${file}_${page + 1}`;
       const metadata = {
         page_number: page + 1,
       };
-      return new Document({ text, id_, metadata });
+      return new Document({ text, metadata });
     });
   }
 }
diff --git a/packages/core/src/readers/SimpleDirectoryReader.edge.ts b/packages/core/src/readers/SimpleDirectoryReader.edge.ts
index 589f45585..5c794d561 100644
--- a/packages/core/src/readers/SimpleDirectoryReader.edge.ts
+++ b/packages/core/src/readers/SimpleDirectoryReader.edge.ts
@@ -1,5 +1,5 @@
 import { path } from "@llamaindex/env";
-import { Document, type Metadata } from "../Node.js";
+import { Document } from "../Node.js";
 import { walk } from "../storage/FileSystem.js";
 import { TextFileReader } from "./TextFileReader.js";
 import type { BaseReader, FileReader } from "./type.js";
@@ -136,7 +136,6 @@ export class SimpleDirectoryReader implements BaseReader {
       }
 
       const fileDocs = await reader.loadData(filePath);
-      fileDocs.forEach(addMetaData(filePath));
 
       // Observer can still cancel addition of the resulting docs from this file
       if (this.doObserverCheck("file", filePath, ReaderStatus.COMPLETE)) {
@@ -167,10 +166,3 @@ export class SimpleDirectoryReader implements BaseReader {
     return true;
   }
 }
-
-function addMetaData(filePath: string): (doc: Document<Metadata>) => void {
-  return (doc: Document<Metadata>) => {
-    doc.metadata["file_path"] = path.resolve(filePath);
-    doc.metadata["file_name"] = path.basename(filePath);
-  };
-}
diff --git a/packages/core/src/readers/TextFileReader.ts b/packages/core/src/readers/TextFileReader.ts
index 0b91e69bf..1b575eb7e 100644
--- a/packages/core/src/readers/TextFileReader.ts
+++ b/packages/core/src/readers/TextFileReader.ts
@@ -1,14 +1,13 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Read a .txt file
 */
-export class TextFileReader implements FileReader {
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file, "utf-8");
-    return [new Document({ text: dataBuffer, id_: file })];
+export class TextFileReader extends FileReader {
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const dataBuffer = fileContent.toString("utf-8");
+    return [new Document({ text: dataBuffer })];
   }
 }
 
diff --git a/packages/core/src/readers/type.ts b/packages/core/src/readers/type.ts
index 233375963..b6e3e9912 100644
--- a/packages/core/src/readers/type.ts
+++ b/packages/core/src/readers/type.ts
@@ -1,3 +1,4 @@
+import { fs, path } from "@llamaindex/env";
 import type { Document } from "../Node.js";
 
 /**
@@ -8,10 +9,26 @@ export interface BaseReader {
 }
 
 /**
- * A reader takes file paths and imports data into Document objects.
+ * A FileReader takes file paths and imports data into Document objects.
 */
-export interface FileReader extends BaseReader {
-  loadData(filePath: string): Promise<Document[]>;
+export abstract class FileReader implements BaseReader {
+  abstract loadDataAsContent(fileContent: Buffer): Promise<Document[]>;
+
+  async loadData(filePath: string): Promise<Document[]> {
+    const fileContent = await fs.readFile(filePath);
+    const docs = await this.loadDataAsContent(fileContent);
+    docs.forEach(FileReader.addMetaData(filePath));
+    return docs;
+  }
+
+  static addMetaData(filePath: string) {
+    return (doc: Document, index: number) => {
+      // generate id as loadDataAsContent is only responsible for the content
+      doc.id_ = `${filePath}_${index + 1}`;
+      doc.metadata["file_path"] = path.resolve(filePath);
+      doc.metadata["file_name"] = path.basename(filePath);
+    };
+  }
 }
 
 // For LlamaParseReader.ts
--
GitLab