Skip to content
Snippets Groups Projects
Commit 73819bf1 authored by Marcus Schiesser
Browse files

feat: Unify metadata and ID handling of documents, allow files to be read by `Buffer`

parent d10cca28
No related branches found
No related tags found
No related merge requests found
---
"llamaindex": patch
---
Unify metadata and ID handling of documents, allow files to be read by `Buffer`
import { fs } from "@llamaindex/env";
import type { ParseConfig } from "papaparse";
import Papa from "papaparse";
import { Document } from "../Node.js";
import type { FileReader } from "./type.js";
import { FileReader } from "./type.js";
/**
* papaparse-based csv parser
* @class PapaCSVReader
* @implements BaseReader
*/
export class PapaCSVReader implements FileReader {
export class PapaCSVReader extends FileReader {
private concatRows: boolean;
private colJoiner: string;
private rowJoiner: string;
......@@ -27,6 +26,7 @@ export class PapaCSVReader implements FileReader {
rowJoiner: string = "\n",
papaConfig?: ParseConfig,
) {
super();
this.concatRows = concatRows;
this.colJoiner = colJoiner;
this.rowJoiner = rowJoiner;
......@@ -39,9 +39,8 @@ export class PapaCSVReader implements FileReader {
* @param {GenericFileSystem} [fs=DEFAULT_FS] - The file system to use for reading the file.
* @returns {Promise<Document[]>}
*/
async loadData(file: string): Promise<Document[]> {
const fileContent = await fs.readFile(file, "utf-8");
const result = Papa.parse(fileContent, this.papaConfig);
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const result = Papa.parse(fileContent.toString("utf-8"), this.papaConfig);
const textList = result.data.map((row: any) => {
// Compatible with header row mode
const rowValues = Object.values(row).map((value) => String(value));
......
import { fs } from "@llamaindex/env";
import mammoth from "mammoth";
import { Document } from "../Node.js";
import type { FileReader } from "./type.js";
import { FileReader } from "./type.js";
export class DocxReader implements FileReader {
export class DocxReader extends FileReader {
/** DocxParser */
async loadData(file: string): Promise<Document[]> {
const dataBuffer = await fs.readFile(file);
const { value } = await mammoth.extractRawText({ buffer: dataBuffer });
return [new Document({ text: value, id_: file })];
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const { value } = await mammoth.extractRawText({ buffer: fileContent });
return [new Document({ text: value })];
}
}
import { fs } from "@llamaindex/env";
import { Document } from "../Node.js";
import type { FileReader } from "./type.js";
import { FileReader } from "./type.js";
/**
* Extract the significant text from an arbitrary HTML document.
......@@ -9,18 +8,18 @@ import type { FileReader } from "./type.js";
* All other tags are removed, and the inner text is kept intact.
* Html entities (e.g., &amp;) are not decoded.
*/
export class HTMLReader implements FileReader {
export class HTMLReader extends FileReader {
/**
* Public method for this reader.
* Required by BaseReader interface.
* @param file Path/name of the file to be loaded.
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one Document parsed from the HTML content of the specified file.
*/
async loadData(file: string): Promise<Document[]> {
const dataBuffer = await fs.readFile(file, "utf-8");
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const dataBuffer = fileContent.toString("utf-8");
const htmlOptions = this.getOptions();
const content = await this.parseContent(dataBuffer, htmlOptions);
return [new Document({ text: content, id_: file })];
return [new Document({ text: content })];
}
/**
......
import { fs } from "@llamaindex/env";
import type { Document } from "../Node.js";
import { ImageDocument } from "../Node.js";
import type { FileReader } from "./type.js";
import { FileReader } from "./type.js";
/**
* Reads the content of an image file into a Document object (which stores the image file as a Blob).
*/
export class ImageReader implements FileReader {
export class ImageReader extends FileReader {
/**
* Public method for this reader.
* Required by BaseReader interface.
......@@ -14,9 +13,8 @@ export class ImageReader implements FileReader {
* @param fs fs wrapper interface for getting the file content.
* @returns Promise<Document[]> A Promise object, eventually yielding zero or one ImageDocument of the specified file.
*/
async loadData(file: string): Promise<Document[]> {
const dataBuffer = await fs.readFile(file);
const blob = new Blob([dataBuffer]);
return [new ImageDocument({ image: blob, id_: file })];
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const blob = new Blob([fileContent]);
return [new ImageDocument({ image: blob })];
}
}
import { fs, getEnv } from "@llamaindex/env";
import { filetypemime } from "magic-bytes.js";
import { Document } from "../Node.js";
import type { FileReader, Language, ResultType } from "./type.js";
import { FileReader, type Language, type ResultType } from "./type.js";
const SupportedFiles: { [key: string]: string } = {
".pdf": "application/pdf",
......@@ -105,7 +105,7 @@ const SupportedFiles: { [key: string]: string } = {
* Represents a reader for parsing files using the LlamaParse API.
* See https://github.com/run-llama/llama_parse
*/
export class LlamaParseReader implements FileReader {
export class LlamaParseReader extends FileReader {
// The API key for the LlamaParse API. Can be set as an environment variable: LLAMA_CLOUD_API_KEY
apiKey: string;
// The base URL of the Llama Parsing API.
......@@ -133,6 +133,7 @@ export class LlamaParseReader implements FileReader {
// numWorkers is implemented in SimpleDirectoryReader
constructor(params: Partial<LlamaParseReader> = {}) {
super();
Object.assign(this, params);
params.apiKey = params.apiKey ?? getEnv("LLAMA_CLOUD_API_KEY");
if (!params.apiKey) {
......@@ -151,17 +152,16 @@ export class LlamaParseReader implements FileReader {
}
// Create a job for the LlamaParse API
private async createJob(file: string): Promise<string> {
private async createJob(data: Buffer): Promise<string> {
// Load data, set the mime type
const data = await fs.readFile(file);
const mimeType = await this.getMimeType(data);
if (this.verbose) {
console.log(`Starting load for file: ${file}`);
console.log(`Starting load for file with mimeType: ${mimeType}`);
}
const body = new FormData();
body.set("file", new Blob([data], { type: mimeType }), file);
body.set("file", new Blob([data], { type: mimeType }));
body.append("language", this.language);
body.append("parsing_instruction", this.parsingInstruction);
body.append("skip_diagonal_text", this.skipDiagonalText.toString());
......@@ -251,15 +251,12 @@ export class LlamaParseReader implements FileReader {
* Loads data from a file and returns an array of Document objects.
* To be used with resultType = "text" and "markdown"
*
* @param {string} file - The path to the file to be loaded.
* @param {Buffer} fileContent - The content of the file to be loaded.
* @return {Promise<Document[]>} A Promise object that resolves to an array of Document objects.
*/
async loadData(file: string): Promise<Document[]> {
// Set metadata to contain file_path
const metadata = { file_path: file };
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
// Creates a job for the file
const jobId = await this.createJob(file);
const jobId = await this.createJob(fileContent);
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
}
......@@ -269,7 +266,6 @@ export class LlamaParseReader implements FileReader {
return [
new Document({
text: resultJson[this.resultType],
metadata: metadata,
}),
];
}
......@@ -281,8 +277,9 @@ export class LlamaParseReader implements FileReader {
* @return {Promise<Record<string, any>>} A Promise that resolves to the JSON object.
*/
async loadJson(file: string): Promise<Record<string, any>> {
const data = await fs.readFile(file);
// Creates a job for the file
const jobId = await this.createJob(file);
const jobId = await this.createJob(data);
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
}
......
import { fs } from "@llamaindex/env";
import { Document } from "../Node.js";
import type { FileReader } from "./type.js";
import { FileReader } from "./type.js";
type MarkdownTuple = [string | null, string];
......@@ -8,7 +7,7 @@ type MarkdownTuple = [string | null, string];
* Extract text from markdown files.
* Returns dictionary with keys as headers and values as the text between headers.
*/
export class MarkdownReader implements FileReader {
export class MarkdownReader extends FileReader {
private _removeHyperlinks: boolean;
private _removeImages: boolean;
......@@ -17,6 +16,7 @@ export class MarkdownReader implements FileReader {
* @param {boolean} [removeImages=true] - Indicates whether images should be removed.
*/
constructor(removeHyperlinks: boolean = true, removeImages: boolean = true) {
super();
this._removeHyperlinks = removeHyperlinks;
this._removeImages = removeImages;
}
......@@ -89,18 +89,17 @@ export class MarkdownReader implements FileReader {
return this.markdownToTups(modifiedContent);
}
async loadData(file: string): Promise<Document[]> {
const content = await fs.readFile(file, "utf-8");
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const content = fileContent.toString("utf-8");
const tups = this.parseTups(content);
const results: Document[] = [];
let counter = 0;
for (const [header, value] of tups) {
const id_ = `${file}_${counter}`;
if (header) {
const text = `\n\n${header}\n${value}`;
results.push(new Document({ text, id_ }));
results.push(new Document({ text }));
} else {
results.push(new Document({ text: value, id_ }));
results.push(new Document({ text: value }));
}
counter += 1;
}
......
import { fs } from "@llamaindex/env";
import { Document } from "../Node.js";
import type { FileReader } from "./type.js";
import { FileReader } from "./type.js";
/**
* Read the text of a PDF
*/
export class PDFReader implements FileReader {
async loadData(file: string): Promise<Document[]> {
const content = await fs.readFile(file);
const pages = await readPDF(content);
export class PDFReader extends FileReader {
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
const pages = await readPDF(fileContent);
return pages.map((text, page) => {
const id_ = `${file}_${page + 1}`;
const metadata = {
page_number: page + 1,
};
return new Document({ text, id_, metadata });
return new Document({ text, metadata });
});
}
}
......
import { path } from "@llamaindex/env";
import { Document, type Metadata } from "../Node.js";
import { Document } from "../Node.js";
import { walk } from "../storage/FileSystem.js";
import { TextFileReader } from "./TextFileReader.js";
import type { BaseReader, FileReader } from "./type.js";
......@@ -136,7 +136,6 @@ export class SimpleDirectoryReader implements BaseReader {
}
const fileDocs = await reader.loadData(filePath);
fileDocs.forEach(addMetaData(filePath));
// Observer can still cancel addition of the resulting docs from this file
if (this.doObserverCheck("file", filePath, ReaderStatus.COMPLETE)) {
......@@ -167,10 +166,3 @@ export class SimpleDirectoryReader implements BaseReader {
return true;
}
}
function addMetaData(filePath: string): (doc: Document<Metadata>) => void {
return (doc: Document<Metadata>) => {
doc.metadata["file_path"] = path.resolve(filePath);
doc.metadata["file_name"] = path.basename(filePath);
};
}
import { fs } from "@llamaindex/env";
import { Document } from "../Node.js";
import type { FileReader } from "./type.js";
import { FileReader } from "./type.js";
/**
* Read a .txt file
*/
export class TextFileReader implements FileReader {
async loadData(file: string): Promise<Document[]> {
const dataBuffer = await fs.readFile(file, "utf-8");
return [new Document({ text: dataBuffer, id_: file })];
export class TextFileReader extends FileReader {
/**
* Decodes the given file content as UTF-8 and returns it as a single Document.
* @param fileContent The content of the text file.
* @returns An array holding one Document whose text is the decoded content.
*/
async loadDataAsContent(fileContent: Buffer): Promise<Document[]> {
// Despite its name, dataBuffer holds the decoded string, not a Buffer.
const dataBuffer = fileContent.toString("utf-8");
return [new Document({ text: dataBuffer })];
}
}
import { fs, path } from "@llamaindex/env";
import type { Document } from "../Node.js";
/**
......@@ -8,10 +9,26 @@ export interface BaseReader {
}
/**
* A reader takes file paths and imports data into Document objects.
* A FileReader takes file paths and imports data into Document objects.
*/
export interface FileReader extends BaseReader {
loadData(filePath: string): Promise<Document[]>;
export abstract class FileReader implements BaseReader {
/**
* Turns the content of a single file into Document objects.
* Implemented by each concrete reader. `loadData` assigns the document IDs and
* the file metadata afterwards, so implementations only provide the content.
* @param fileContent The content of the file.
* @returns The documents parsed from the content.
*/
abstract loadDataAsContent(fileContent: Buffer): Promise<Document[]>;
/**
* Reads the file at `filePath`, parses it with `loadDataAsContent` and stamps
* every resulting document with an ID and file metadata via `addMetaData`.
* @param filePath Path of the file to load.
* @returns The parsed documents with `id_`, `file_path` and `file_name` set.
*/
async loadData(filePath: string): Promise<Document[]> {
// readFile is called without an encoding; decoding is left to loadDataAsContent.
const fileContent = await fs.readFile(filePath);
const docs = await this.loadDataAsContent(fileContent);
// forEach supplies (doc, index), which is the signature of the returned callback.
docs.forEach(FileReader.addMetaData(filePath));
return docs;
}
/**
* Builds a callback that stamps a document with its ID and the metadata of
* the file it was loaded from. Intended to be passed to `Array.prototype.forEach`.
* @param filePath Path of the source file.
* @returns A function taking a document and its position in the result list.
*/
static addMetaData(filePath: string) {
return (doc: Document, index: number) => {
// generate id as loadDataAsContent is only responsible for the content
// The ID is `<filePath>_<index + 1>` using filePath exactly as given (not
// resolved); any id_ already present on the document is overwritten.
doc.id_ = `${filePath}_${index + 1}`;
// file_path is stored in resolved form, file_name is the last path segment.
doc.metadata["file_path"] = path.resolve(filePath);
doc.metadata["file_name"] = path.basename(filePath);
};
}
}
// For LlamaParseReader.ts
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment