Skip to content
Snippets Groups Projects
Unverified Commit 304484b7 authored by Fabian Wimmer's avatar Fabian Wimmer Committed by GitHub
Browse files

feat: add ignoreErrors flag to LlamaParse (#959)

parent 29fed77d
Branches
Tags
No related merge requests found
---
"llamaindex": patch
---
feat: add ignoreErrors flag to LlamaParseReader
......@@ -133,6 +133,8 @@ export class LlamaParseReader extends FileReader {
gpt4oMode: boolean = false;
// The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY.
gpt4oApiKey?: string;
// Whether to ignore errors raised during parsing. When true, the error is logged and an empty result is returned instead of the error being rethrown.
ignoreErrors: boolean = true;
// numWorkers is implemented in SimpleDirectoryReader
constructor(params: Partial<LlamaParseReader> = {}) {
......@@ -278,19 +280,29 @@ export class LlamaParseReader extends FileReader {
fileContent: Uint8Array,
fileName?: string,
): Promise<Document[]> {
// Creates a job for the file
const jobId = await this.createJob(fileContent, fileName);
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
}
let jobId;
try {
// Creates a job for the file
jobId = await this.createJob(fileContent, fileName);
if (this.verbose) {
console.log(`Started parsing the file under job id ${jobId}`);
}
// Return results as Document objects
const resultJson = await this.getJobResult(jobId, this.resultType);
return [
new Document({
text: resultJson[this.resultType],
}),
];
// Return results as Document objects
const resultJson = await this.getJobResult(jobId, this.resultType);
return [
new Document({
text: resultJson[this.resultType],
}),
];
} catch (e) {
console.error(`Error while parsing file under job id ${jobId}`, e);
if (this.ignoreErrors) {
return [];
} else {
throw e;
}
}
}
/**
* Loads data from a file and returns an array of JSON objects.
......@@ -300,18 +312,28 @@ export class LlamaParseReader extends FileReader {
* @return {Promise<Record<string, any>[]>} A Promise that resolves to an array of JSON objects.
*/
async loadJson(file: string): Promise<Record<string, any>[]> {
  // Declared outside the try block so the catch handler can include it in the
  // log message; it is still undefined if reading the file or creating the
  // job is what failed.
  let jobId;
  try {
    const data = await fs.readFile(file);
    // Creates a job for the file
    jobId = await this.createJob(data);
    if (this.verbose) {
      console.log(`Started parsing the file under job id ${jobId}`);
    }

    // Return results as an array of JSON objects (same format as Python version of the reader)
    const resultJson = await this.getJobResult(jobId, "json");
    // Attach the job id and source path to the returned result object.
    resultJson.job_id = jobId;
    resultJson.file_path = file;
    return [resultJson];
  } catch (e) {
    console.error(`Error while parsing the file under job id ${jobId}`, e);
    // With ignoreErrors set, a failure yields an empty result instead of
    // propagating to the caller.
    if (this.ignoreErrors) {
      return [];
    } else {
      throw e;
    }
  }
}
/**
......@@ -326,51 +348,81 @@ export class LlamaParseReader extends FileReader {
jsonResult: Record<string, any>[],
downloadPath: string,
): Promise<Record<string, any>[]> {
const headers = { Authorization: `Bearer ${this.apiKey}` };
// Create download directory if it doesn't exist (Actually check for write access, not existence, since fsPromises does not have a `existsSync` method)
if (!fs.access(downloadPath)) {
await fs.mkdir(downloadPath, { recursive: true });
}
const images: Record<string, any>[] = [];
for (const result of jsonResult) {
const jobId = result.job_id;
for (const page of result.pages) {
if (this.verbose) {
console.log(`> Image for page ${page.page}: ${page.images}`);
}
for (const image of page.images) {
const imageName = image.name;
// Get the full path
let imagePath = `${downloadPath}/${jobId}-${imageName}`;
try {
// Create the download directory if it cannot be accessed (`fs.access` is called without a mode argument and rejects when the check fails; fsPromises does not have an `existsSync` method)
try {
await fs.access(downloadPath);
} catch {
await fs.mkdir(downloadPath, { recursive: true });
}
if (!imagePath.endsWith(".png") && !imagePath.endsWith(".jpg")) {
imagePath += ".png";
const images: Record<string, any>[] = [];
for (const result of jsonResult) {
const jobId = result.job_id;
for (const page of result.pages) {
if (this.verbose) {
console.log(`> Image for page ${page.page}: ${page.images}`);
}
// Get a valid image path
image.path = imagePath;
image.job_id = jobId;
image.original_pdf_path = result.file_path;
image.page_number = page.page;
const imageUrl = `${this.baseUrl}/job/${jobId}/result/image/${imageName}`;
const response = await fetch(imageUrl, { headers });
if (!response.ok) {
throw new Error(
`Failed to download image: ${await response.text()}`,
for (const image of page.images) {
const imageName = image.name;
const imagePath = await this.getImagePath(
downloadPath,
jobId,
imageName,
);
await this.fetchAndSaveImage(imageName, imagePath, jobId);
// Assign metadata to the image
image.path = imagePath;
image.job_id = jobId;
image.original_pdf_path = result.file_path;
image.page_number = page.page;
images.push(image);
}
const arrayBuffer = await response.arrayBuffer();
const buffer = new Uint8Array(arrayBuffer);
await fs.writeFile(imagePath, buffer);
images.push(image);
}
}
return images;
} catch (e) {
console.error(`Error while downloading images from the parsed result`, e);
if (this.ignoreErrors) {
return [];
} else {
throw e;
}
}
}
/**
 * Builds the local file path for a downloaded image.
 *
 * The path has the form `<downloadPath>/<jobId>-<imageName>`; ".png" is
 * appended unless the path already ends in ".png" or ".jpg".
 *
 * @param downloadPath Directory the image is stored in.
 * @param jobId Id of the parse job, used as a file name prefix.
 * @param imageName Name of the image.
 * @returns The resulting image path.
 */
private async getImagePath(
  downloadPath: string,
  jobId: string,
  imageName: string,
): Promise<string> {
  const candidate = `${downloadPath}/${jobId}-${imageName}`;
  // Only these two suffixes are treated as an existing image extension.
  const hasImageExtension = [".png", ".jpg"].some((suffix) =>
    candidate.endsWith(suffix),
  );
  return hasImageExtension ? candidate : `${candidate}.png`;
}
/**
 * Downloads a single image belonging to a parse job and writes it to disk.
 *
 * @param imageName Name of the image, used as the last segment of the request URL.
 * @param imagePath File path the downloaded bytes are written to.
 * @param jobId Id of the parse job the image belongs to.
 * @throws Error when the response status is not ok; the message contains the response body text.
 */
private async fetchAndSaveImage(
  imageName: string,
  imagePath: string,
  jobId: string,
): Promise<void> {
  const headers = { Authorization: `Bearer ${this.apiKey}` };

  // Construct the image URL
  const imageUrl = `${this.baseUrl}/job/${jobId}/result/image/${imageName}`;
  const response = await fetch(imageUrl, { headers });
  if (!response.ok) {
    throw new Error(`Failed to download image: ${await response.text()}`);
  }

  // Convert the response to an ArrayBuffer and then to a Uint8Array
  const arrayBuffer = await response.arrayBuffer();
  const buffer = new Uint8Array(arrayBuffer);

  // Write the image bytes to the specified imagePath
  await fs.writeFile(imagePath, buffer);
}
static async getMimeType(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment