From 345300f11003d8d929055ce69cc0b27d8a13737c Mon Sep 17 00:00:00 2001 From: Fabian Wimmer <github@insightby.ai> Date: Mon, 29 Jul 2024 11:16:46 +0200 Subject: [PATCH] feat: add split by page mode to LlamaParseReader (#924) --- .changeset/chilled-tomatoes-visit.md | 5 ++++ .../data_loaders/llama_parse/index.mdx | 1 + .../src/readers/LlamaParseReader.ts | 24 +++++++++++++++++-- 3 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 .changeset/chilled-tomatoes-visit.md diff --git a/.changeset/chilled-tomatoes-visit.md b/.changeset/chilled-tomatoes-visit.md new file mode 100644 index 000000000..4a19d366c --- /dev/null +++ b/.changeset/chilled-tomatoes-visit.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +feat: add splitByPage mode to LlamaParseReader diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx index 0efff3713..80bafbd0e 100644 --- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx +++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx @@ -48,6 +48,7 @@ They can be divided into two groups. - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`. - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document. - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`. +- `splitByPage` Whether to split the results, creating one document per page. Uses the set `pageSeparator` or `\n---\n` as fallback. Default is true. - `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`. - `vendorMultimodalModel?` Optional. 
Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost. - `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`. diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts index 51e3d2e81..6532ac5cc 100644 --- a/packages/llamaindex/src/readers/LlamaParseReader.ts +++ b/packages/llamaindex/src/readers/LlamaParseReader.ts @@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader { targetPages?: string; // Whether or not to ignore and skip errors raised during parsing. ignoreErrors: boolean = true; + // Whether to split by page using the pageSeparator or '\n---\n' as default. + splitByPage: boolean = true; // Whether to use the vendor multimodal API. useVendorMultimodalModel: boolean = false; // The model name for the vendor multimodal API @@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader { } // Return results as Document objects - const resultJson = await this.getJobResult(jobId, this.resultType); + const jobResults = await this.getJobResult(jobId, this.resultType); + const resultText = jobResults[this.resultType]; + + // Split the text by separator if splitByPage is true + if (this.splitByPage) { + return this.splitTextBySeparator(resultText); + } + return [ new Document({ - text: resultJson[this.resultType], + text: resultText, }), ]; } catch (e) { @@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader { return filteredParams; } + private splitTextBySeparator(text: string): Document[] { + const separator = this.pageSeparator ?? 
"\n---\n"; + const textChunks = text.split(separator); + return textChunks.map( + (docChunk: string) => + new Document({ + text: docChunk, + }), + ); + } + static async getMimeType( data: Uint8Array, ): Promise<{ mime: string; extension: string }> { -- GitLab