From 73819bf19d63d7a7169afc7a1628535bbdd10fd9 Mon Sep 17 00:00:00 2001
From: Marcus Schiesser <mail@marcusschiesser.de>
Date: Thu, 6 Jun 2024 11:51:54 +0200
Subject: [PATCH] feat: Unify metadata and ID handling of documents, allow files to be read by `Buffer`

---
 .changeset/twenty-crabs-join.md               |  5 ++++
 packages/core/src/readers/CSVReader.ts        | 11 ++++----
 packages/core/src/readers/DocxReader.ts       | 12 ++++-----
 packages/core/src/readers/HTMLReader.ts       | 11 ++++----
 packages/core/src/readers/ImageReader.ts      | 12 ++++-----
 packages/core/src/readers/LlamaParseReader.ts | 25 ++++++++-----------
 packages/core/src/readers/MarkdownReader.ts   | 15 ++++++-----
 packages/core/src/readers/PDFReader.ts        | 13 ++++------
 .../src/readers/SimpleDirectoryReader.edge.ts | 10 +-------
 packages/core/src/readers/TextFileReader.ts   | 11 ++++----
 packages/core/src/readers/type.ts             | 23 ++++++++++++++---
 11 files changed, 74 insertions(+), 74 deletions(-)
 create mode 100644 .changeset/twenty-crabs-join.md

diff --git a/.changeset/twenty-crabs-join.md b/.changeset/twenty-crabs-join.md
new file mode 100644
index 000000000..95afc4bee
--- /dev/null
+++ b/.changeset/twenty-crabs-join.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+Unify metadata and ID handling of documents, allow files to be read by `Buffer`
diff --git a/packages/core/src/readers/CSVReader.ts b/packages/core/src/readers/CSVReader.ts
index 4844b5f1d..39f0abc4d 100644
--- a/packages/core/src/readers/CSVReader.ts
+++ b/packages/core/src/readers/CSVReader.ts
@@ -1,15 +1,14 @@
-import { fs } from "@llamaindex/env";
 import type { ParseConfig } from "papaparse";
 import Papa from "papaparse";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * papaparse-based csv parser
  * @class CSVReader
  * @implements BaseReader
  */
-export class PapaCSVReader implements FileReader {
+export class PapaCSVReader extends FileReader {
   private concatRows: boolean;
   private colJoiner: string;
   private rowJoiner: string;
@@ -27,6 +26,7 @@ export class PapaCSVReader implements FileReader {
     rowJoiner: string = "\n",
     papaConfig?: ParseConfig,
   ) {
+    super();
     this.concatRows = concatRows;
     this.colJoiner = colJoiner;
     this.rowJoiner = rowJoiner;
@@ -39,9 +39,8 @@ export class PapaCSVReader implements FileReader {
    * @param {GenericFileSystem} [fs=DEFAULT_FS] - The file system to use for reading the file.
    * @returns {Promise<Document[]>}
    */
-  async loadData(file: string): Promise<Document[]> {
-    const fileContent = await fs.readFile(file, "utf-8");
-    const result = Papa.parse(fileContent, this.papaConfig);
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const result = Papa.parse(fileContent.toString("utf-8"), this.papaConfig);
     const textList = result.data.map((row: any) => {
       // Compatible with header row mode
       const rowValues = Object.values(row).map((value) => String(value));
diff --git a/packages/core/src/readers/DocxReader.ts b/packages/core/src/readers/DocxReader.ts
index 0f79aa90a..1d4357b44 100644
--- a/packages/core/src/readers/DocxReader.ts
+++ b/packages/core/src/readers/DocxReader.ts
@@ -1,13 +1,11 @@
-import { fs } from "@llamaindex/env";
 import mammoth from "mammoth";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
-export class DocxReader implements FileReader {
+export class DocxReader extends FileReader {
   /** DocxParser */
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file);
-    const { value } = await mammoth.extractRawText({ buffer: dataBuffer });
-    return [new Document({ text: value, id_: file })];
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const { value } = await mammoth.extractRawText({ buffer: fileContent });
+    return [new Document({ text: value })];
   }
 }
diff --git a/packages/core/src/readers/HTMLReader.ts b/packages/core/src/readers/HTMLReader.ts
index 5b16c5556..6927444f5 100644
--- a/packages/core/src/readers/HTMLReader.ts
+++ b/packages/core/src/readers/HTMLReader.ts
@@ -1,6 +1,5 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Extract the significant text from an arbitrary HTML document.
@@ -9,18 +8,18 @@ import type { FileReader } from "./type.js";
  * All other tags are removed, and the inner text is kept intact.
  * Html entities (e.g., &) are not decoded.
  */
-export class HTMLReader implements FileReader {
+export class HTMLReader extends FileReader {
   /**
    * Public method for this reader.
    * Required by BaseReader interface.
    * @param file Path/name of the file to be loaded.
    * @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file.
    */
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file, "utf-8");
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const dataBuffer = fileContent.toString("utf-8");
     const htmlOptions = this.getOptions();
     const content = await this.parseContent(dataBuffer, htmlOptions);
-    return [new Document({ text: content, id_: file })];
+    return [new Document({ text: content })];
   }
 
   /**
diff --git a/packages/core/src/readers/ImageReader.ts b/packages/core/src/readers/ImageReader.ts
index 6a3d70b4c..0414e2199 100644
--- a/packages/core/src/readers/ImageReader.ts
+++ b/packages/core/src/readers/ImageReader.ts
@@ -1,12 +1,11 @@
-import { fs } from "@llamaindex/env";
 import type { Document } from "../Node.js";
 import { ImageDocument } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Reads the content of an image file into a Document object (which stores the image file as a Blob).
  */
-export class ImageReader implements FileReader {
+export class ImageReader extends FileReader {
   /**
    * Public method for this reader.
    * Required by BaseReader interface.
@@ -14,9 +13,8 @@ export class ImageReader implements FileReader {
    * @param fs fs wrapper interface for getting the file content.
    * @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file.
    */
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file);
-    const blob = new Blob([dataBuffer]);
-    return [new ImageDocument({ image: blob, id_: file })];
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const blob = new Blob([fileContent]);
+    return [new ImageDocument({ image: blob })];
   }
 }
diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts
index f292e4609..dc85b6457 100644
--- a/packages/core/src/readers/LlamaParseReader.ts
+++ b/packages/core/src/readers/LlamaParseReader.ts
@@ -1,7 +1,7 @@
 import { fs, getEnv } from "@llamaindex/env";
 import { filetypemime } from "magic-bytes.js";
 import { Document } from "../Node.js";
-import type { FileReader, Language, ResultType } from "./type.js";
+import { FileReader, type Language, type ResultType } from "./type.js";
 
 const SupportedFiles: { [key: string]: string } = {
   ".pdf": "application/pdf",
@@ -105,7 +105,7 @@ const SupportedFiles: { [key: string]: string } = {
  * Represents a reader for parsing files using the LlamaParse API.
  * See https://github.com/run-llama/llama_parse
  */
-export class LlamaParseReader implements FileReader {
+export class LlamaParseReader extends FileReader {
   // The API key for the LlamaParse API. Can be set as an environment variable: LLAMA_CLOUD_API_KEY
   apiKey: string;
   // The base URL of the Llama Parsing API.
@@ -133,6 +133,7 @@ export class LlamaParseReader implements FileReader {
   // numWorkers is implemented in SimpleDirectoryReader
 
   constructor(params: Partial<LlamaParseReader> = {}) {
+    super();
     Object.assign(this, params);
     params.apiKey = params.apiKey ?? getEnv("LLAMA_CLOUD_API_KEY");
     if (!params.apiKey) {
@@ -151,17 +152,16 @@ export class LlamaParseReader implements FileReader {
   }
 
   // Create a job for the LlamaParse API
-  private async createJob(file: string): Promise<string> {
+  private async createJob(data: Buffer): Promise<string> {
     // Load data, set the mime type
-    const data = await fs.readFile(file);
     const mimeType = await this.getMimeType(data);
 
     if (this.verbose) {
-      console.log(`Starting load for file: ${file}`);
+      console.log(`Starting load for file with mimeType: ${mimeType}`);
     }
 
     const body = new FormData();
-    body.set("file", new Blob([data], { type: mimeType }), file);
+    body.set("file", new Blob([data], { type: mimeType }));
     body.append("language", this.language);
     body.append("parsing_instruction", this.parsingInstruction);
     body.append("skip_diagonal_text", this.skipDiagonalText.toString());
@@ -251,15 +251,12 @@ export class LlamaParseReader implements FileReader {
    * Loads data from a file and returns an array of Document objects.
    * To be used with resultType = "text" and "markdown"
    *
-   * @param {string} file - The path to the file to be loaded.
+   * @param {Buffer} fileContent - The content of the file to be loaded.
   * @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
    */
-  async loadData(file: string): Promise<Document[]> {
-    // Set metadata to contain file_path
-    const metadata = { file_path: file };
-
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
     // Creates a job for the file
-    const jobId = await this.createJob(file);
+    const jobId = await this.createJob(fileContent);
     if (this.verbose) {
       console.log(`Started parsing the file under job id ${jobId}`);
     }
@@ -269,7 +266,6 @@ export class LlamaParseReader implements FileReader {
     return [
       new Document({
         text: resultJson[this.resultType],
-        metadata: metadata,
       }),
     ];
   }
@@ -281,8 +277,9 @@ export class LlamaParseReader implements FileReader {
    * @return {Promise<Record<string, any>>} A Promise that resolves to the JSON object.
    */
   async loadJson(file: string): Promise<Record<string, any>> {
+    const data = await fs.readFile(file);
     // Creates a job for the file
-    const jobId = await this.createJob(file);
+    const jobId = await this.createJob(data);
     if (this.verbose) {
       console.log(`Started parsing the file under job id ${jobId}`);
     }
diff --git a/packages/core/src/readers/MarkdownReader.ts b/packages/core/src/readers/MarkdownReader.ts
index 47a1ecc44..940fd4dda 100644
--- a/packages/core/src/readers/MarkdownReader.ts
+++ b/packages/core/src/readers/MarkdownReader.ts
@@ -1,6 +1,5 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 type MarkdownTuple = [string | null, string];
 
@@ -8,7 +7,7 @@ type MarkdownTuple = [string | null, string];
  * Extract text from markdown files.
  * Returns dictionary with keys as headers and values as the text between headers.
  */
-export class MarkdownReader implements FileReader {
+export class MarkdownReader extends FileReader {
   private _removeHyperlinks: boolean;
   private _removeImages: boolean;
 
@@ -17,6 +16,7 @@ export class MarkdownReader implements FileReader {
    * @param {boolean} [removeImages=true] - Indicates whether images should be removed.
    */
   constructor(removeHyperlinks: boolean = true, removeImages: boolean = true) {
+    super();
     this._removeHyperlinks = removeHyperlinks;
     this._removeImages = removeImages;
   }
@@ -89,18 +89,17 @@ export class MarkdownReader implements FileReader {
     return this.markdownToTups(modifiedContent);
   }
 
-  async loadData(file: string): Promise<Document[]> {
-    const content = await fs.readFile(file, "utf-8");
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const content = fileContent.toString("utf-8");
     const tups = this.parseTups(content);
     const results: Document[] = [];
     let counter = 0;
     for (const [header, value] of tups) {
-      const id_ = `${file}_${counter}`;
       if (header) {
         const text = `\n\n${header}\n${value}`;
-        results.push(new Document({ text, id_ }));
+        results.push(new Document({ text }));
       } else {
-        results.push(new Document({ text: value, id_ }));
+        results.push(new Document({ text: value }));
       }
       counter += 1;
     }
diff --git a/packages/core/src/readers/PDFReader.ts b/packages/core/src/readers/PDFReader.ts
index bda8ab753..50ff9df7d 100644
--- a/packages/core/src/readers/PDFReader.ts
+++ b/packages/core/src/readers/PDFReader.ts
@@ -1,20 +1,17 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Read the text of a PDF
 */
-export class PDFReader implements FileReader {
-  async loadData(file: string): Promise<Document[]> {
-    const content = await fs.readFile(file);
-    const pages = await readPDF(content);
+export class PDFReader extends FileReader {
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const pages = await readPDF(fileContent);
     return pages.map((text, page) => {
-      const id_ = `${file}_${page + 1}`;
       const metadata = {
         page_number: page + 1,
       };
-      return new Document({ text, id_, metadata });
+      return new Document({ text, metadata });
     });
   }
 }
diff --git a/packages/core/src/readers/SimpleDirectoryReader.edge.ts b/packages/core/src/readers/SimpleDirectoryReader.edge.ts
index 589f45585..5c794d561 100644
--- a/packages/core/src/readers/SimpleDirectoryReader.edge.ts
+++ b/packages/core/src/readers/SimpleDirectoryReader.edge.ts
@@ -1,5 +1,5 @@
 import { path } from "@llamaindex/env";
-import { Document, type Metadata } from "../Node.js";
+import { Document } from "../Node.js";
 import { walk } from "../storage/FileSystem.js";
 import { TextFileReader } from "./TextFileReader.js";
 import type { BaseReader, FileReader } from "./type.js";
@@ -136,7 +136,6 @@ export class SimpleDirectoryReader implements BaseReader {
       }
 
       const fileDocs = await reader.loadData(filePath);
-      fileDocs.forEach(addMetaData(filePath));
 
       // Observer can still cancel addition of the resulting docs from this file
       if (this.doObserverCheck("file", filePath, ReaderStatus.COMPLETE)) {
@@ -167,10 +166,3 @@ export class SimpleDirectoryReader implements BaseReader {
     return true;
   }
 }
-
-function addMetaData(filePath: string): (doc: Document<Metadata>) => void {
-  return (doc: Document<Metadata>) => {
-    doc.metadata["file_path"] = path.resolve(filePath);
-    doc.metadata["file_name"] = path.basename(filePath);
-  };
-}
diff --git a/packages/core/src/readers/TextFileReader.ts b/packages/core/src/readers/TextFileReader.ts
index 0b91e69bf..1b575eb7e 100644
--- a/packages/core/src/readers/TextFileReader.ts
+++ b/packages/core/src/readers/TextFileReader.ts
@@ -1,14 +1,13 @@
-import { fs } from "@llamaindex/env";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
+import { FileReader } from "./type.js";
 
 /**
  * Read a .txt file
 */
-export class TextFileReader implements FileReader {
-  async loadData(file: string): Promise<Document[]> {
-    const dataBuffer = await fs.readFile(file, "utf-8");
-    return [new Document({ text: dataBuffer, id_: file })];
+export class TextFileReader extends FileReader {
+  async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
+    const dataBuffer = fileContent.toString("utf-8");
+    return [new Document({ text: dataBuffer })];
   }
 }
 
diff --git a/packages/core/src/readers/type.ts b/packages/core/src/readers/type.ts
index 233375963..b6e3e9912 100644
--- a/packages/core/src/readers/type.ts
+++ b/packages/core/src/readers/type.ts
@@ -1,3 +1,4 @@
+import { fs, path } from "@llamaindex/env";
 import type { Document } from "../Node.js";
 
 /**
@@ -8,10 +9,26 @@ export interface BaseReader {
 }
 
 /**
- * A reader takes file paths and imports data into Document objects.
+ * A FileReader takes file paths and imports data into Document objects.
 */
-export interface FileReader extends BaseReader {
-  loadData(filePath: string): Promise<Document[]>;
+export abstract class FileReader implements BaseReader {
+  abstract loadDataAsContent(fileContent: Buffer): Promise<Document[]>;
+
+  async loadData(filePath: string): Promise<Document[]> {
+    const fileContent = await fs.readFile(filePath);
+    const docs = await this.loadDataAsContent(fileContent);
+    docs.forEach(FileReader.addMetaData(filePath));
+    return docs;
+  }
+
+  static addMetaData(filePath: string) {
+    return (doc: Document, index: number) => {
+      // generate id as loadDataAsContent is only responsible for the content
+      doc.id_ = `${filePath}_${index + 1}`;
+      doc.metadata["file_path"] = path.resolve(filePath);
+      doc.metadata["file_name"] = path.basename(filePath);
+    };
+  }
 }
 
 // For LlamaParseReader.ts
--
GitLab