Skip to content
Snippets Groups Projects
Unverified commit 08c55ec2, authored by Marcus Schiesser, committed by GitHub
Browse files

fix: Add metadata to PDFs and use Uint8Array for readers content (#980)

parent 394e7975
No related branches found
No related tags found
No related merge requests found
Showing
with 39 additions and 27 deletions
---
"llamaindex": patch
---
Add metadata to PDFs and use Uint8Array for readers content
......@@ -7,7 +7,7 @@ import {
import { TextFileReader } from "llamaindex/readers/TextFileReader";
class ZipReader extends FileReader {
loadDataAsContent(fileContent: Buffer): Promise<Document<Metadata>[]> {
loadDataAsContent(fileContent: Uint8Array): Promise<Document<Metadata>[]> {
throw new Error("Implement me");
}
}
......
......@@ -39,8 +39,10 @@ export class PapaCSVReader extends FileReader {
* @param {GenericFileSystem} [fs=DEFAULT_FS] - The file system to use for reading the file.
* @returns {Promise<Document[]>}
*/
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const result = Papa.parse(fileContent.toString("utf-8"), this.papaConfig);
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const decoder = new TextDecoder("utf-8");
const fileContentString = decoder.decode(fileContent);
const result = Papa.parse(fileContentString, this.papaConfig);
const textList = result.data.map((row: any) => {
// Compatible with header row mode
const rowValues = Object.values(row).map((value) => String(value));
......
......@@ -4,8 +4,11 @@ import { FileReader } from "./type.js";
export class DocxReader extends FileReader {
/** DocxParser */
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const { value } = await mammoth.extractRawText({ buffer: fileContent });
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
// Note: `await mammoth.extractRawText({ arrayBuffer: fileContent })` does not work,
// so we need to convert the content to a Buffer first.
const buffer = Buffer.from(fileContent);
const { value } = await mammoth.extractRawText({ buffer });
return [new Document({ text: value })];
}
}
......@@ -15,8 +15,9 @@ export class HTMLReader extends FileReader {
* @param file Path/name of the file to be loaded.
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file.
*/
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const dataBuffer = fileContent.toString("utf-8");
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const decoder = new TextDecoder("utf-8");
const dataBuffer = decoder.decode(fileContent);
const htmlOptions = this.getOptions();
const content = await this.parseContent(dataBuffer, htmlOptions);
return [new Document({ text: content })];
......
......@@ -13,7 +13,7 @@ export class ImageReader extends FileReader {
* @param fs fs wrapper interface for getting the file content.
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file.
*/
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const blob = new Blob([fileContent]);
return [new ImageDocument({ image: blob })];
}
......
......@@ -160,7 +160,10 @@ export class LlamaParseReader extends FileReader {
}
// Create a job for the LlamaParse API
private async createJob(data: Buffer, fileName?: string): Promise<string> {
private async createJob(
data: Uint8Array,
fileName?: string,
): Promise<string> {
// Load data, set the mime type
const { mimeType, extension } = await this.getMimeType(data);
......@@ -272,12 +275,12 @@ export class LlamaParseReader extends FileReader {
* Loads data from a file and returns an array of Document objects.
* To be used with resultType = "text" and "markdown"
*
* @param {Buffer} fileContent - The content of the file to be loaded.
* @param {Uint8Array} fileContent - The content of the file to be loaded.
* @param {string} [fileName] - The optional name of the file to be loaded.
* @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
*/
async loadDataAsContent(
fileContent: Buffer,
fileContent: Uint8Array,
fileName?: string,
): Promise<Document[]> {
// Creates a job for the file
......@@ -365,7 +368,7 @@ export class LlamaParseReader extends FileReader {
);
}
const arrayBuffer = await response.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);
const buffer = new Uint8Array(arrayBuffer);
await fs.writeFile(imagePath, buffer);
images.push(image);
......@@ -376,7 +379,7 @@ export class LlamaParseReader extends FileReader {
}
private async getMimeType(
data: Buffer,
data: Uint8Array,
): Promise<{ mimeType: string; extension: string }> {
const mimes = filetypemime(data); // Get an array of possible MIME types
const extension = Object.keys(SupportedFiles).find(
......
......@@ -89,8 +89,9 @@ export class MarkdownReader extends FileReader {
return this.markdownToTups(modifiedContent);
}
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const content = fileContent.toString("utf-8");
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const decoder = new TextDecoder("utf-8");
const content = decoder.decode(fileContent);
const tups = this.parseTups(content);
const results: Document[] = [];
let counter = 0;
......
import { fs } from "@llamaindex/env";
import { Document } from "../Node.js";
import { FileReader } from "./type.js";
......@@ -6,11 +5,6 @@ import { FileReader } from "./type.js";
* Read the text of a PDF
*/
export class PDFReader extends FileReader {
async loadData(file: string): Promise<Document[]> {
const content = await fs.readFile(file);
return this.loadDataAsContent(new Uint8Array(content.buffer));
}
async loadDataAsContent(content: Uint8Array): Promise<Document[]> {
const { totalPages, text } = await readPDF(content);
return text.map((text, page) => {
......
......@@ -6,8 +6,9 @@ import { FileReader } from "./type.js";
*/
export class TextFileReader extends FileReader {
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const dataBuffer = fileContent.toString("utf-8");
async loadDataAsContent(fileContent: Uint8Array): Promise<Document[]> {
const decoder = new TextDecoder("utf-8");
const dataBuffer = decoder.decode(fileContent);
return [new Document({ text: dataBuffer })];
}
}
......@@ -13,12 +13,13 @@ export interface BaseReader {
*/
export abstract class FileReader implements BaseReader {
abstract loadDataAsContent(
fileContent: Buffer,
fileContent: Uint8Array,
fileName?: string,
): Promise<Document[]>;
async loadData(filePath: string): Promise<Document[]> {
const fileContent = await fs.readFile(filePath);
// XXX: create a new Uint8Array to prevent "Please provide binary data as `Uint8Array`, rather than `Buffer`." error in PDFReader
const fileContent = new Uint8Array(await fs.readFile(filePath));
const fileName = path.basename(filePath);
const docs = await this.loadDataAsContent(fileContent, fileName);
docs.forEach(FileReader.addMetaData(filePath));
......
......@@ -19,7 +19,6 @@ describe("jsonToIndexStruct", () => {
const expected = new IndexDict();
expected.addNode(node);
console.log("expected.toJson()", expected.toJson());
const actual = jsonToIndexStruct(expected.toJson());
expect(isIndexDict(actual)).toBe(true);
......
......@@ -6,7 +6,9 @@ describe("pdf reader", () => {
test("basic.pdf", async () => {
const documents = await reader.loadData("../../../examples/data/basic.pdf");
expect(documents.length).toBe(1);
expect(documents[0].metadata).toEqual({
expect(documents[0].metadata).toMatchObject({
file_path: expect.any(String),
file_name: "basic.pdf",
page_number: 1,
total_pages: 1,
});
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment