From 345300f11003d8d929055ce69cc0b27d8a13737c Mon Sep 17 00:00:00 2001
From: Fabian Wimmer <github@insightby.ai>
Date: Mon, 29 Jul 2024 11:16:46 +0200
Subject: [PATCH] feat: add split by page mode to LlamaParseReader (#924)

---
 .changeset/chilled-tomatoes-visit.md          |  5 ++++
 .../data_loaders/llama_parse/index.mdx        |  1 +
 .../src/readers/LlamaParseReader.ts           | 24 +++++++++++++++++--
 3 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/chilled-tomatoes-visit.md

diff --git a/.changeset/chilled-tomatoes-visit.md b/.changeset/chilled-tomatoes-visit.md
new file mode 100644
index 000000000..4a19d366c
--- /dev/null
+++ b/.changeset/chilled-tomatoes-visit.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+feat: add splitByPage mode to LlamaParseReader
diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
index 0efff3713..80bafbd0e 100644
--- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
+++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
@@ -48,6 +48,7 @@ They can be divided into two groups.
 - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
 - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.
 - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`.
+- `splitByPage` Whether to split the results, creating one document per page. Uses the configured `pageSeparator`, falling back to `\n---\n` if none is set. Default is `true`.
 - `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`.
 - `vendorMultimodalModel?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost.
 - `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`.
diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts
index 51e3d2e81..6532ac5cc 100644
--- a/packages/llamaindex/src/readers/LlamaParseReader.ts
+++ b/packages/llamaindex/src/readers/LlamaParseReader.ts
@@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader {
   targetPages?: string;
   // Whether or not to ignore and skip errors raised during parsing.
   ignoreErrors: boolean = true;
+  // Whether to split by page using the pageSeparator or '\n---\n' as default.
+  splitByPage: boolean = true;
   // Whether to use the vendor multimodal API.
   useVendorMultimodalModel: boolean = false;
   // The model name for the vendor multimodal API
@@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader {
       }
 
       // Return results as Document objects
-      const resultJson = await this.getJobResult(jobId, this.resultType);
+      const jobResults = await this.getJobResult(jobId, this.resultType);
+      const resultText = jobResults[this.resultType];
+
+      // Split the text by separator if splitByPage is true
+      if (this.splitByPage) {
+        return this.splitTextBySeparator(resultText);
+      }
+
       return [
         new Document({
-          text: resultJson[this.resultType],
+          text: resultText,
         }),
       ];
     } catch (e) {
@@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader {
     return filteredParams;
   }
 
+  // Splits the parsed text on the configured pageSeparator (or "\n---\n"
+  // when none is set) and wraps each resulting chunk in its own Document.
+  private splitTextBySeparator(text: string): Document[] {
+    const pageBreak = this.pageSeparator ?? "\n---\n";
+    const pages: Document[] = [];
+    for (const pageText of text.split(pageBreak)) {
+      pages.push(new Document({ text: pageText }));
+    }
+    return pages;
+  }
+
   static async getMimeType(
     data: Uint8Array,
   ): Promise<{ mime: string; extension: string }> {
-- 
GitLab