diff --git a/.changeset/tasty-bears-joke.md b/.changeset/tasty-bears-joke.md
new file mode 100644
index 0000000000000000000000000000000000000000..b156d86acbe231d846bb93faf7bbc8cb64f74b01
--- /dev/null
+++ b/.changeset/tasty-bears-joke.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+feat: add ignoreErrors flag to LlamaParseReader (enabled by default: errors are logged and an empty result is returned instead of being thrown)
diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts
index cc30280279c6823500b4d40514104eb8abb03113..ef63fa02a12b9106eb9b0c8d17d695ec1c0b29ce 100644
--- a/packages/llamaindex/src/readers/LlamaParseReader.ts
+++ b/packages/llamaindex/src/readers/LlamaParseReader.ts
@@ -133,6 +133,9 @@ export class LlamaParseReader extends FileReader {
   gpt4oMode: boolean = false;
   // The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY.
   gpt4oApiKey?: string;
+  // Whether or not to ignore and skip errors raised during parsing.
+  // When true (the default), the error is logged and an empty array is returned instead of rethrowing.
+  ignoreErrors: boolean = true;
   // numWorkers is implemented in SimpleDirectoryReader
 
   constructor(params: Partial<LlamaParseReader> = {}) {
@@ -278,19 +281,32 @@ export class LlamaParseReader extends FileReader {
     fileContent: Uint8Array,
     fileName?: string,
   ): Promise<Document[]> {
-    // Creates a job for the file
-    const jobId = await this.createJob(fileContent, fileName);
-    if (this.verbose) {
-      console.log(`Started parsing the file under job id ${jobId}`);
-    }
+    // Declared outside the try block so the catch handler can log it; it is
+    // still undefined there if creating the job itself failed.
+    let jobId;
+    try {
+      // Creates a job for the file
+      jobId = await this.createJob(fileContent, fileName);
+      if (this.verbose) {
+        console.log(`Started parsing the file under job id ${jobId}`);
+      }
 
-    // Return results as Document objects
-    const resultJson = await this.getJobResult(jobId, this.resultType);
-    return [
-      new Document({
-        text: resultJson[this.resultType],
-      }),
-    ];
+      // Return results as Document objects
+      const resultJson = await this.getJobResult(jobId, this.resultType);
+      return [
+        new Document({
+          text: resultJson[this.resultType],
+        }),
+      ];
+    } catch (e) {
+      console.error(`Error while parsing file under job id ${jobId}`, e);
+      // With ignoreErrors set, swallow the failure and return no documents
+      if (this.ignoreErrors) {
+        return [];
+      } else {
+        throw e;
+      }
+    }
   }
   /**
    * Loads data from a file and returns an array of JSON objects.
@@ -300,18 +316,31 @@ export class LlamaParseReader extends FileReader {
    * @return {Promise<Record<string, any>[]>} A Promise that resolves to an array of JSON objects.
    */
   async loadJson(file: string): Promise<Record<string, any>[]> {
-    const data = await fs.readFile(file);
-    // Creates a job for the file
-    const jobId = await this.createJob(data);
-    if (this.verbose) {
-      console.log(`Started parsing the file under job id ${jobId}`);
-    }
+    // Declared outside the try block so the catch handler can log it; it is
+    // still undefined there if reading the file or creating the job failed.
+    let jobId;
+    try {
+      const data = await fs.readFile(file);
+      // Creates a job for the file
+      jobId = await this.createJob(data);
+      if (this.verbose) {
+        console.log(`Started parsing the file under job id ${jobId}`);
+      }
 
-    // Return results as an array of JSON objects (same format as Python version of the reader)
-    const resultJson = await this.getJobResult(jobId, "json");
-    resultJson.job_id = jobId;
-    resultJson.file_path = file;
-    return [resultJson];
+      // Return results as an array of JSON objects (same format as Python version of the reader)
+      const resultJson = await this.getJobResult(jobId, "json");
+      resultJson.job_id = jobId;
+      resultJson.file_path = file;
+      return [resultJson];
+    } catch (e) {
+      console.error(`Error while parsing the file under job id ${jobId}`, e);
+      // With ignoreErrors set, swallow the failure and return no results
+      if (this.ignoreErrors) {
+        return [];
+      } else {
+        throw e;
+      }
+    }
   }
 
   /**
@@ -326,51 +355,97 @@ export class LlamaParseReader extends FileReader {
     jsonResult: Record<string, any>[],
     downloadPath: string,
   ): Promise<Record<string, any>[]> {
-    const headers = { Authorization: `Bearer ${this.apiKey}` };
-
-    // Create download directory if it doesn't exist (Actually check for write access, not existence, since fsPromises does not have a `existsSync` method)
-    if (!fs.access(downloadPath)) {
-      await fs.mkdir(downloadPath, { recursive: true });
-    }
-
-    const images: Record<string, any>[] = [];
-    for (const result of jsonResult) {
-      const jobId = result.job_id;
-      for (const page of result.pages) {
-        if (this.verbose) {
-          console.log(`> Image for page ${page.page}: ${page.images}`);
-        }
-        for (const image of page.images) {
-          const imageName = image.name;
-          // Get the full path
-          let imagePath = `${downloadPath}/${jobId}-${imageName}`;
+    try {
+      // Create the download directory if it doesn't exist. fsPromises has no
+      // `existsSync`, so probe the path with `access` and create the directory
+      // when the probe rejects.
+      try {
+        await fs.access(downloadPath);
+      } catch {
+        await fs.mkdir(downloadPath, { recursive: true });
+      }
 
-          if (!imagePath.endsWith(".png") && !imagePath.endsWith(".jpg")) {
-            imagePath += ".png";
+      const images: Record<string, any>[] = [];
+      for (const result of jsonResult) {
+        const jobId = result.job_id;
+        for (const page of result.pages) {
+          if (this.verbose) {
+            console.log(`> Image for page ${page.page}: ${page.images}`);
           }
-
-          // Get a valid image path
-          image.path = imagePath;
-          image.job_id = jobId;
-          image.original_pdf_path = result.file_path;
-          image.page_number = page.page;
-
-          const imageUrl = `${this.baseUrl}/job/${jobId}/result/image/${imageName}`;
-          const response = await fetch(imageUrl, { headers });
-          if (!response.ok) {
-            throw new Error(
-              `Failed to download image: ${await response.text()}`,
-            );
+          for (const image of page.images) {
+            const imageName = image.name;
+            const imagePath = this.getImagePath(downloadPath, jobId, imageName);
+            await this.fetchAndSaveImage(imageName, imagePath, jobId);
+            // Assign metadata to the image
+            image.path = imagePath;
+            image.job_id = jobId;
+            image.original_pdf_path = result.file_path;
+            image.page_number = page.page;
+            images.push(image);
           }
-          const arrayBuffer = await response.arrayBuffer();
-          const buffer = new Uint8Array(arrayBuffer);
-          await fs.writeFile(imagePath, buffer);
-
-          images.push(image);
         }
       }
+      return images;
+    } catch (e) {
+      console.error(`Error while downloading images from the parsed result`, e);
+      if (this.ignoreErrors) {
+        return [];
+      } else {
+        throw e;
+      }
+    }
+  }
+
+  /**
+   * Builds the local path an image is saved under: downloadPath, a slash, then jobId-imageName,
+   * with ".png" appended when that path ends in neither ".png" nor ".jpg".
+   *
+   * @param {string} downloadPath - Directory the image is saved into.
+   * @param {string} jobId - Id of the parsing job the image belongs to.
+   * @param {string} imageName - Name of the image within the job result.
+   * @return {string} The path the image should be written to.
+   */
+  private getImagePath(
+    downloadPath: string,
+    jobId: string,
+    imageName: string,
+  ): string {
+    // Get the full path
+    let imagePath = `${downloadPath}/${jobId}-${imageName}`;
+    // Get a valid image path
+    if (!imagePath.endsWith(".png") && !imagePath.endsWith(".jpg")) {
+      imagePath += ".png";
+    }
+
+    return imagePath;
+  }
+
+  /**
+   * Downloads a single image of a parsing job and writes it to `imagePath`.
+   *
+   * @param {string} imageName - Name of the image within the job result.
+   * @param {string} imagePath - Local path the downloaded bytes are written to.
+   * @param {string} jobId - Id of the parsing job the image belongs to.
+   * @return {Promise<void>} A Promise that resolves once the file has been written.
+   * @throws {Error} If the image request does not return an OK response.
+   */
+  private async fetchAndSaveImage(
+    imageName: string,
+    imagePath: string,
+    jobId: string,
+  ): Promise<void> {
+    const headers = { Authorization: `Bearer ${this.apiKey}` };
+    // Construct the image URL
+    const imageUrl = `${this.baseUrl}/job/${jobId}/result/image/${imageName}`;
+    const response = await fetch(imageUrl, { headers });
+    if (!response.ok) {
+      throw new Error(`Failed to download image: ${await response.text()}`);
     }
-    return images;
+    // Convert the response to an ArrayBuffer and then to a Uint8Array
+    const arrayBuffer = await response.arrayBuffer();
+    const buffer = new Uint8Array(arrayBuffer);
+    // Write the image buffer to the specified imagePath
+    await fs.writeFile(imagePath, buffer);
   }
 
   static async getMimeType(