Skip to content
Snippets Groups Projects
Unverified commit 08c55ec2, authored by Marcus Schiesser and committed by GitHub
Browse files

fix: Add metadata to PDFs and use Uint8Array for readers content (#980)

parent 394e7975
No related branches found
No related tags found
No related merge requests found
Showing changed files with 39 additions and 27 deletions
---
"llamaindex": patch
---
Add metadata to PDFs and use Uint8Array for readers content
...@@ -7,7 +7,7 @@ import { ...@@ -7,7 +7,7 @@ import {
import { TextFileReader } from "llamaindex/readers/TextFileReader"; import { TextFileReader } from "llamaindex/readers/TextFileReader";
class ZipReader extends FileReader { class ZipReader extends FileReader {
loadDataAsContent(fileContent: Buffer): Promise<Document<Metadata>[]> { loadDataAsContent(fileContent: Uint8Array): Promise<Document<Metadata>[]> {
throw new Error("Implement me"); throw new Error("Implement me");
} }
} }
......
...@@ -39,8 +39,10 @@ export class PapaCSVReader extends FileReader { ...@@ -39,8 +39,10 @@ export class PapaCSVReader extends FileReader {
* @param {GenericFileSystem} [fs=DEFAULT_FS] - The file system to use for reading the file. * @param {GenericFileSystem} [fs=DEFAULT_FS] - The file system to use for reading the file.
* @returns {Promise<Document[]>} * @returns {Promise<Document[]>}
*/ */
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const result = Papa.parse(fileContent.toString("utf-8"), this.papaConfig); const decoder = new TextDecoder("utf-8");
const fileContentString = decoder.decode(fileContent);
const result = Papa.parse(fileContentString, this.papaConfig);
const textList = result.data.map((row: any) => { const textList = result.data.map((row: any) => {
// Compatible with header row mode // Compatible with header row mode
const rowValues = Object.values(row).map((value) => String(value)); const rowValues = Object.values(row).map((value) => String(value));
......
...@@ -4,8 +4,11 @@ import { FileReader } from "./type.js"; ...@@ -4,8 +4,11 @@ import { FileReader } from "./type.js";
export class DocxReader extends FileReader { export class DocxReader extends FileReader {
/** DocxParser */ /** DocxParser */
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const { value } = await mammoth.extractRawText({ buffer: fileContent }); // Note: await mammoth.extractRawText({ arrayBuffer: fileContent }); is not working
// So we need to convert to Buffer first
const buffer = Buffer.from(fileContent);
const { value } = await mammoth.extractRawText({ buffer });
return [new Document({ text: value })]; return [new Document({ text: value })];
} }
} }
...@@ -15,8 +15,9 @@ export class HTMLReader extends FileReader { ...@@ -15,8 +15,9 @@ export class HTMLReader extends FileReader {
* @param file Path/name of the file to be loaded. * @param file Path/name of the file to be loaded.
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file. * @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file.
*/ */
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const dataBuffer = fileContent.toString("utf-8"); const decoder = new TextDecoder("utf-8");
const dataBuffer = decoder.decode(fileContent);
const htmlOptions = this.getOptions(); const htmlOptions = this.getOptions();
const content = await this.parseContent(dataBuffer, htmlOptions); const content = await this.parseContent(dataBuffer, htmlOptions);
return [new Document({ text: content })]; return [new Document({ text: content })];
......
...@@ -13,7 +13,7 @@ export class ImageReader extends FileReader { ...@@ -13,7 +13,7 @@ export class ImageReader extends FileReader {
* @param fs fs wrapper interface for getting the file content. * @param fs fs wrapper interface for getting the file content.
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file. * @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file.
*/ */
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const blob = new Blob([fileContent]); const blob = new Blob([fileContent]);
return [new ImageDocument({ image: blob })]; return [new ImageDocument({ image: blob })];
} }
......
...@@ -160,7 +160,10 @@ export class LlamaParseReader extends FileReader { ...@@ -160,7 +160,10 @@ export class LlamaParseReader extends FileReader {
} }
// Create a job for the LlamaParse API // Create a job for the LlamaParse API
private async createJob(data: Buffer, fileName?: string): Promise<string> { private async createJob(
data: Uint8Array,
fileName?: string,
): Promise<string> {
// Load data, set the mime type // Load data, set the mime type
const { mimeType, extension } = await this.getMimeType(data); const { mimeType, extension } = await this.getMimeType(data);
...@@ -272,12 +275,12 @@ export class LlamaParseReader extends FileReader { ...@@ -272,12 +275,12 @@ export class LlamaParseReader extends FileReader {
* Loads data from a file and returns an array of Document objects. * Loads data from a file and returns an array of Document objects.
* To be used with resultType = "text" and "markdown" * To be used with resultType = "text" and "markdown"
* *
* @param {Buffer} fileContent - The content of the file to be loaded. * @param {Uint8Array} fileContent - The content of the file to be loaded.
* @param {string} [fileName] - The optional name of the file to be loaded. * @param {string} [fileName] - The optional name of the file to be loaded.
* @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects. * @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
*/ */
async loadDataAsContent( async loadDataAsContent(
fileContent: Buffer, fileContent: Uint8Array,
fileName?: string, fileName?: string,
): Promise<Document[]> { ): Promise<Document[]> {
// Creates a job for the file // Creates a job for the file
...@@ -365,7 +368,7 @@ export class LlamaParseReader extends FileReader { ...@@ -365,7 +368,7 @@ export class LlamaParseReader extends FileReader {
); );
} }
const arrayBuffer = await response.arrayBuffer(); const arrayBuffer = await response.arrayBuffer();
const buffer = Buffer.from(arrayBuffer); const buffer = new Uint8Array(arrayBuffer);
await fs.writeFile(imagePath, buffer); await fs.writeFile(imagePath, buffer);
images.push(image); images.push(image);
...@@ -376,7 +379,7 @@ export class LlamaParseReader extends FileReader { ...@@ -376,7 +379,7 @@ export class LlamaParseReader extends FileReader {
} }
private async getMimeType( private async getMimeType(
data: Buffer, data: Uint8Array,
): Promise<{ mimeType: string; extension: string }> { ): Promise<{ mimeType: string; extension: string }> {
const mimes = filetypemime(data); // Get an array of possible MIME types const mimes = filetypemime(data); // Get an array of possible MIME types
const extension = Object.keys(SupportedFiles).find( const extension = Object.keys(SupportedFiles).find(
......
...@@ -89,8 +89,9 @@ export class MarkdownReader extends FileReader { ...@@ -89,8 +89,9 @@ export class MarkdownReader extends FileReader {
return this.markdownToTups(modifiedContent); return this.markdownToTups(modifiedContent);
} }
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const content = fileContent.toString("utf-8"); const decoder = new TextDecoder("utf-8");
const content = decoder.decode(fileContent);
const tups = this.parseTups(content); const tups = this.parseTups(content);
const results: Document[] = []; const results: Document[] = [];
let counter = 0; let counter = 0;
......
import { fs } from "@llamaindex/env";
import { Document } from "../Node.js"; import { Document } from "../Node.js";
import { FileReader } from "./type.js"; import { FileReader } from "./type.js";
...@@ -6,11 +5,6 @@ import { FileReader } from "./type.js"; ...@@ -6,11 +5,6 @@ import { FileReader } from "./type.js";
* Read the text of a PDF * Read the text of a PDF
*/ */
export class PDFReader extends FileReader { export class PDFReader extends FileReader {
async loadData(file: string): Promise<Document[]> {
const content = await fs.readFile(file);
return this.loadDataAsContent(new Uint8Array(content.buffer));
}
async loadDataAsContent(content: Uint8Array): Promise<Document[]> { async loadDataAsContent(content: Uint8Array): Promise<Document[]> {
const { totalPages, text } = await readPDF(content); const { totalPages, text } = await readPDF(content);
return text.map((text, page) => { return text.map((text, page) => {
......
...@@ -6,8 +6,9 @@ import { FileReader } from "./type.js"; ...@@ -6,8 +6,9 @@ import { FileReader } from "./type.js";
*/ */
export class TextFileReader extends FileReader { export class TextFileReader extends FileReader {
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> { async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const dataBuffer = fileContent.toString("utf-8"); const decoder = new TextDecoder("utf-8");
const dataBuffer = decoder.decode(fileContent);
return [new Document({ text: dataBuffer })]; return [new Document({ text: dataBuffer })];
} }
} }
...@@ -13,12 +13,13 @@ export interface BaseReader { ...@@ -13,12 +13,13 @@ export interface BaseReader {
*/ */
export abstract class FileReader implements BaseReader { export abstract class FileReader implements BaseReader {
abstract loadDataAsContent( abstract loadDataAsContent(
fileContent: Buffer, fileContent: Uint8Array,
fileName?: string, fileName?: string,
): Promise<Document[]>; ): Promise<Document[]>;
async loadData(filePath: string): Promise<Document[]> { async loadData(filePath: string): Promise<Document[]> {
const fileContent = await fs.readFile(filePath); // XXX: create a new Uint8Array to prevent "Please provide binary data as `Uint8Array`, rather than `Buffer`." error in PDFReader
const fileContent = new Uint8Array(await fs.readFile(filePath));
const fileName = path.basename(filePath); const fileName = path.basename(filePath);
const docs = await this.loadDataAsContent(fileContent, fileName); const docs = await this.loadDataAsContent(fileContent, fileName);
docs.forEach(FileReader.addMetaData(filePath)); docs.forEach(FileReader.addMetaData(filePath));
......
...@@ -19,7 +19,6 @@ describe("jsonToIndexStruct", () => { ...@@ -19,7 +19,6 @@ describe("jsonToIndexStruct", () => {
const expected = new IndexDict(); const expected = new IndexDict();
expected.addNode(node); expected.addNode(node);
console.log("expected.toJson()", expected.toJson());
const actual = jsonToIndexStruct(expected.toJson()); const actual = jsonToIndexStruct(expected.toJson());
expect(isIndexDict(actual)).toBe(true); expect(isIndexDict(actual)).toBe(true);
......
...@@ -6,7 +6,9 @@ describe("pdf reader", () => { ...@@ -6,7 +6,9 @@ describe("pdf reader", () => {
test("basic.pdf", async () => { test("basic.pdf", async () => {
const documents = await reader.loadData("../../../examples/data/basic.pdf"); const documents = await reader.loadData("../../../examples/data/basic.pdf");
expect(documents.length).toBe(1); expect(documents.length).toBe(1);
expect(documents[0].metadata).toEqual({ expect(documents[0].metadata).toMatchObject({
file_path: expect.any(String),
file_name: "basic.pdf",
page_number: 1, page_number: 1,
total_pages: 1, total_pages: 1,
}); });
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment