diff --git a/.changeset/chilled-tomatoes-visit.md b/.changeset/chilled-tomatoes-visit.md new file mode 100644 index 0000000000000000000000000000000000000000..4a19d366c143854570d83cb5184f3f6e4e2ee2ef --- /dev/null +++ b/.changeset/chilled-tomatoes-visit.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +feat: add splitByPage mode to LlamaParseReader diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx index 0efff3713c630137f5384643ab436ba766e5a7bc..80bafbd0e0a898c2d7bcdae282df024dda307812 100644 --- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx +++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx @@ -48,6 +48,7 @@ They can be divided into two groups. - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`. - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document. - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`. +- `splitByPage` Whether to split the results, creating one document per page. Uses the set `pageSeparator` or `\n---\n` as fallback. Default is `true`. - `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`. - `vendorMultimodalModel?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost. - `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`. 
diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts index 51e3d2e81a16335c443784cbe0aa89cb48e6c544..6532ac5ccf4703762981b4a31981144e162b84ae 100644 --- a/packages/llamaindex/src/readers/LlamaParseReader.ts +++ b/packages/llamaindex/src/readers/LlamaParseReader.ts @@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader { targetPages?: string; // Whether or not to ignore and skip errors raised during parsing. ignoreErrors: boolean = true; + // Whether to split by page using the pageSeparator or '\n---\n' as default. + splitByPage: boolean = true; // Whether to use the vendor multimodal API. useVendorMultimodalModel: boolean = false; // The model name for the vendor multimodal API @@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader { } // Return results as Document objects - const resultJson = await this.getJobResult(jobId, this.resultType); + const jobResults = await this.getJobResult(jobId, this.resultType); + const resultText = jobResults[this.resultType]; + + // Split the text by separator if splitByPage is true + if (this.splitByPage) { + return this.splitTextBySeparator(resultText); + } + return [ new Document({ - text: resultJson[this.resultType], + text: resultText, }), ]; } catch (e) { @@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader { return filteredParams; } + private splitTextBySeparator(text: string): Document[] { + const separator = this.pageSeparator ?? "\n---\n"; + const textChunks = text.split(separator); + return textChunks.map( + (docChunk: string) => + new Document({ + text: docChunk, + }), + ); + } + static async getMimeType( data: Uint8Array, ): Promise<{ mime: string; extension: string }> {