diff --git a/.changeset/new-worms-teach.md b/.changeset/new-worms-teach.md new file mode 100644 index 0000000000000000000000000000000000000000..40ffcdae3c5f364e847cfd69a8dd0c71aed2104e --- /dev/null +++ b/.changeset/new-worms-teach.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +Add metadata to PDFs and use Uint8Array for readers content diff --git a/examples/readers/src/custom-simple-directory-reader.ts b/examples/readers/src/custom-simple-directory-reader.ts index 817425fbc8b9f04bff301f1a5b12e0ce888f60c2..19549a4d5ea6e5779c096915ca66ca98b226ab94 100644 --- a/examples/readers/src/custom-simple-directory-reader.ts +++ b/examples/readers/src/custom-simple-directory-reader.ts @@ -7,7 +7,7 @@ import { import { TextFileReader } from "llamaindex/readers/TextFileReader"; class ZipReader extends FileReader { - loadDataAsContent(fileContent: Buffer): Promise<Document<Metadata>[]> { + loadDataAsContent(fileContent: Uint8Array): Promise<Document<Metadata>[]> { throw new Error("Implement me"); } } diff --git a/packages/llamaindex/src/readers/CSVReader.ts b/packages/llamaindex/src/readers/CSVReader.ts index 39f0abc4d990732f403365cd2e776dbe3e75d0c3..98e3ae3e2a0a127b333c5a05b177a2a204311eeb 100644 --- a/packages/llamaindex/src/readers/CSVReader.ts +++ b/packages/llamaindex/src/readers/CSVReader.ts @@ -39,8 +39,10 @@ export class PapaCSVReader extends FileReader { * @param {GenericFileSystem} [fs=DEFAULT_FS] - The file system to use for reading the file. * @returns {Promise<Document[]>} */ - async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { - const result = Papa.parse(fileContent.toString("utf-8"), this.papaConfig); + async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> { + const decoder = new TextDecoder("utf-8"); + const fileContentString = decoder.decode(fileContent); + const result = Papa.parse(fileContentString, this.papaConfig); const textList = result.data.map((row: any) => { // Compatible with header row mode const rowValues = Object.values(row).map((value) => String(value)); diff --git a/packages/llamaindex/src/readers/DocxReader.ts b/packages/llamaindex/src/readers/DocxReader.ts index 1d4357b44590edf5e5dcb91bbadad627a427bf7b..baf1c5330e2cd5a2af75993b9c0420296eb4b774 100644 --- a/packages/llamaindex/src/readers/DocxReader.ts +++ b/packages/llamaindex/src/readers/DocxReader.ts @@ -4,8 +4,11 @@ import { FileReader } from "./type.js"; export class DocxReader extends FileReader { /** DocxParser */ - async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { - const { value } = await mammoth.extractRawText({ buffer: fileContent }); + async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> { + // Note: await mammoth.extractRawText({ arrayBuffer: fileContent }); is not working + // So we need to convert to Buffer first + const buffer = Buffer.from(fileContent); + const { value } = await mammoth.extractRawText({ buffer }); return [new Document({ text: value })]; } } diff --git a/packages/llamaindex/src/readers/HTMLReader.ts b/packages/llamaindex/src/readers/HTMLReader.ts index 6927444f5a72b6a435b54d19c06b7d26dd9933c3..e5c660a872ee965aa48dfa55d3b26401ff8fd232 100644 --- a/packages/llamaindex/src/readers/HTMLReader.ts +++ b/packages/llamaindex/src/readers/HTMLReader.ts @@ -15,8 +15,9 @@ export class HTMLReader extends FileReader { * @param file Path/name of the file to be loaded. * @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file. */ - async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { - const dataBuffer = fileContent.toString("utf-8"); + async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> { + const decoder = new TextDecoder("utf-8"); + const dataBuffer = decoder.decode(fileContent); const htmlOptions = this.getOptions(); const content = await this.parseContent(dataBuffer, htmlOptions); return [new Document({ text: content })]; diff --git a/packages/llamaindex/src/readers/ImageReader.ts b/packages/llamaindex/src/readers/ImageReader.ts index 0414e2199966fbaa1d84cb2660d9f885d141c188..e88ba7699f29f6719a36044b2458a0bbfcdd5aa6 100644 --- a/packages/llamaindex/src/readers/ImageReader.ts +++ b/packages/llamaindex/src/readers/ImageReader.ts @@ -13,7 +13,7 @@ export class ImageReader extends FileReader { * @param fs fs wrapper interface for getting the file content. * @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file. */ - async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { + async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> { const blob = new Blob([fileContent]); return [new ImageDocument({ image: blob })]; } diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts index 7aa27a6fa837dab7a244fcfdc3b7266b9ace161f..5010a5f24b15b64fcab58101ee0ac7d844e36d90 100644 --- a/packages/llamaindex/src/readers/LlamaParseReader.ts +++ b/packages/llamaindex/src/readers/LlamaParseReader.ts @@ -160,7 +160,10 @@ export class LlamaParseReader extends FileReader { } // Create a job for the LlamaParse API - private async createJob(data: Buffer, fileName?: string): Promise<string> { + private async createJob( + data: Uint8Array, + fileName?: string, + ): Promise<string> { // Load data, set the mime type const { mimeType, extension } = await this.getMimeType(data); @@ -272,12 +275,12 @@ export class LlamaParseReader extends FileReader { * Loads data from a file and returns an array of Document objects. * To be used with resultType = "text" and "markdown" * - * @param {Buffer} fileContent - The content of the file to be loaded. + * @param {Uint8Array} fileContent - The content of the file to be loaded. * @param {string} [fileName] - The optional name of the file to be loaded. * @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects. */ async loadDataAsContent( - fileContent: Buffer, + fileContent: Uint8Array, fileName?: string, ): Promise<Document[]> { // Creates a job for the file @@ -365,7 +368,7 @@ export class LlamaParseReader extends FileReader { ); } const arrayBuffer = await response.arrayBuffer(); - const buffer = Buffer.from(arrayBuffer); + const buffer = new Uint8Array(arrayBuffer); await fs.writeFile(imagePath, buffer); images.push(image); @@ -376,7 +379,7 @@ export class LlamaParseReader extends FileReader { } private async getMimeType( - data: Buffer, + data: Uint8Array, ): Promise<{ mimeType: string; extension: string }> { const mimes = filetypemime(data); // Get an array of possible MIME types const extension = Object.keys(SupportedFiles).find( diff --git a/packages/llamaindex/src/readers/MarkdownReader.ts b/packages/llamaindex/src/readers/MarkdownReader.ts index 940fd4ddae51461cab21a95a25c832137751a408..89640739ff58699425d40950c26c056353f1dd48 100644 --- a/packages/llamaindex/src/readers/MarkdownReader.ts +++ b/packages/llamaindex/src/readers/MarkdownReader.ts @@ -89,8 +89,9 @@ export class MarkdownReader extends FileReader { return this.markdownToTups(modifiedContent); } - async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { - const content = fileContent.toString("utf-8"); + async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> { + const decoder = new TextDecoder("utf-8"); + const content = decoder.decode(fileContent); const tups = this.parseTups(content); const results: Document[] = []; let counter = 0; diff --git a/packages/llamaindex/src/readers/PDFReader.ts b/packages/llamaindex/src/readers/PDFReader.ts index b5d51c989e84f5e691cf877b3a48c2a1a1616ed7..20d66c913d7c960cfa3b30eae94968d07a108a19 100644 --- a/packages/llamaindex/src/readers/PDFReader.ts +++ b/packages/llamaindex/src/readers/PDFReader.ts @@ -1,4 +1,3 @@ -import { fs } from "@llamaindex/env"; import { Document } from "../Node.js"; import { FileReader } from "./type.js"; @@ -6,11 +5,6 @@ import { FileReader } from "./type.js"; * Read the text of a PDF */ export class PDFReader extends FileReader { - async loadData(file: string): Promise<Document[]> { - const content = await fs.readFile(file); - return this.loadDataAsContent(new Uint8Array(content.buffer)); - } - async loadDataAsContent(content: Uint8Array): Promise<Document[]> { const { totalPages, text } = await readPDF(content); return text.map((text, page) => { diff --git a/packages/llamaindex/src/readers/TextFileReader.ts b/packages/llamaindex/src/readers/TextFileReader.ts index 1b575eb7e06e52812bedd8ebf92ddfa8c866c11e..7e98b4afaa830be0f6a4b3dc2892ac4abf7c3a25 100644 --- a/packages/llamaindex/src/readers/TextFileReader.ts +++ b/packages/llamaindex/src/readers/TextFileReader.ts @@ -6,8 +6,9 @@ import { FileReader } from "./type.js"; */ export class TextFileReader extends FileReader { - async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { - const dataBuffer = fileContent.toString("utf-8"); + async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> { + const decoder = new TextDecoder("utf-8"); + const dataBuffer = decoder.decode(fileContent); return [new Document({ text: dataBuffer })]; } } diff --git a/packages/llamaindex/src/readers/type.ts b/packages/llamaindex/src/readers/type.ts index 90549f3a7a369d4a670cc397a6afc049151443e0..612e9d73f31b287fc86dac08bd96e95b507dd6b7 100644 --- a/packages/llamaindex/src/readers/type.ts +++ b/packages/llamaindex/src/readers/type.ts @@ -13,12 +13,13 @@ export interface BaseReader { */ export abstract class FileReader implements BaseReader { abstract loadDataAsContent( - fileContent: Buffer, + fileContent: Uint8Array, fileName?: string, ): Promise<Document[]>; async loadData(filePath: string): Promise<Document[]> { - const fileContent = await fs.readFile(filePath); + // XXX: create a new Uint8Array to prevent "Please provide binary data as `Uint8Array`, rather than `Buffer`." error in PDFReader + const fileContent = new Uint8Array(await fs.readFile(filePath)); const fileName = path.basename(filePath); const docs = await this.loadDataAsContent(fileContent, fileName); docs.forEach(FileReader.addMetaData(filePath)); diff --git a/packages/llamaindex/tests/indices/json-to-index-struct.test.ts b/packages/llamaindex/tests/indices/json-to-index-struct.test.ts index ed996e9790d57b92b13c6857c35d913263f9a12b..62e944c098df14597f8bcc57db4e258c7e09a771 100644 --- a/packages/llamaindex/tests/indices/json-to-index-struct.test.ts +++ b/packages/llamaindex/tests/indices/json-to-index-struct.test.ts @@ -19,7 +19,6 @@ describe("jsonToIndexStruct", () => { const expected = new IndexDict(); expected.addNode(node); - console.log("expected.toJson()", expected.toJson()); const actual = jsonToIndexStruct(expected.toJson()); expect(isIndexDict(actual)).toBe(true); diff --git a/packages/llamaindex/tests/readers/pdf-reader.test.ts b/packages/llamaindex/tests/readers/pdf-reader.test.ts index 60c13478337738a6c4c37c8fd4d09ebb88921afc..59aac67a01289a3e5ed1a56332a29cc5abb08747 100644 --- a/packages/llamaindex/tests/readers/pdf-reader.test.ts +++ b/packages/llamaindex/tests/readers/pdf-reader.test.ts @@ -6,7 +6,9 @@ describe("pdf reader", () => { test("basic.pdf", async () => { const documents = await reader.loadData("../../../examples/data/basic.pdf"); expect(documents.length).toBe(1); - expect(documents[0].metadata).toEqual({ + expect(documents[0].metadata).toMatchObject({ + file_path: expect.any(String), + file_name: "basic.pdf", page_number: 1, total_pages: 1, });