From fb6db454d416d8fdb0eceaaf81ba2952d2f3889e Mon Sep 17 00:00:00 2001
From: Fabian Wimmer <github@insightby.ai>
Date: Thu, 25 Jul 2024 10:09:43 +0200
Subject: [PATCH] feat: update/add pageSeparator params to LlamaParseReader
 (#1051)

---
 .changeset/weak-news-train.md                 |  5 +++
 .../docs/modules/data_loaders/discord.mdx     |  2 +-
 apps/docs/docs/modules/data_loaders/index.mdx | 10 ++++-
 .../data_loaders/llama_parse/index.mdx        |  4 +-
 .../src/readers/LlamaParseReader.ts           | 37 ++++++++++++++++++-
 5 files changed, 53 insertions(+), 5 deletions(-)
 create mode 100644 .changeset/weak-news-train.md

diff --git a/.changeset/weak-news-train.md b/.changeset/weak-news-train.md
new file mode 100644
index 000000000..f8a8794db
--- /dev/null
+++ b/.changeset/weak-news-train.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+feat: add pageSeparator params to LlamaParseReader
diff --git a/apps/docs/docs/modules/data_loaders/discord.mdx b/apps/docs/docs/modules/data_loaders/discord.mdx
index 756123532..ffb4c312e 100644
--- a/apps/docs/docs/modules/data_loaders/discord.mdx
+++ b/apps/docs/docs/modules/data_loaders/discord.mdx
@@ -20,7 +20,7 @@ Copy the URL in your browser and select the server you want your bot to join.
 #### DiscordReader()
 
 - `discordToken?`: The Discord bot token.
-- `makeRequest?`: Optionally provide a custom request function for edge environments, e.g. `fetch`. See discord.js for more info.
+- `requestHandler?`: Optionally provide a custom request function for edge environments, e.g. `fetch`. See discord.js for more info.
 
 #### DiscordReader.loadData
 
diff --git a/apps/docs/docs/modules/data_loaders/index.mdx b/apps/docs/docs/modules/data_loaders/index.mdx
index 710515ebd..2fbf9bc8e 100644
--- a/apps/docs/docs/modules/data_loaders/index.mdx
+++ b/apps/docs/docs/modules/data_loaders/index.mdx
@@ -16,7 +16,15 @@ It is a simple reader that reads all files from a directory and its subdirectori
 
 <CodeBlock language="ts">{CodeSource}</CodeBlock>
 
-Currently, it supports reading `.txt`, `.pdf`, `.csv`, `.md`, `.docx`, `.htm`, `.html`, `.jpg`, `.jpeg`, `.png` and `.gif` files, but support for other file types is planned.
+Currently, the following readers are mapped to specific file types:
+
+- [TextFileReader](../../api/classes/TextFileReader.md): `.txt`
+- [PDFReader](../../api/classes/PDFReader.md): `.pdf`
+- [PapaCSVReader](../../api/classes/PapaCSVReader.md): `.csv`
+- [MarkdownReader](../../api/classes/MarkdownReader.md): `.md`
+- [DocxReader](../../api/classes/DocxReader.md): `.docx`
+- [HTMLReader](../../api/classes/HTMLReader.md): `.htm`, `.html`
+- [ImageReader](../../api/classes/ImageReader.md): `.jpg`, `.jpeg`, `.png`, `.gif`
 
 You can modify the reader three different ways:
 
diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
index a6f6c2750..0efff3713 100644
--- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
+++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
@@ -41,7 +41,9 @@ They can be divided into two groups.
 - `doNotCache?` Optional. Set to true to not cache the document.
 - `fastMode?` Optional. Set to true to use the fast mode. This mode will skip OCR of images, and table/heading reconstruction. Note: Non-compatible with `gpt4oMode`.
 - `doNotUnrollColumns?` Optional. Set to true to keep the text according to document layout. Reduce reconstruction accuracy, and LLMs/embeddings performances in most cases.
-- `pageSeparator?` Optional. The page separator to use. Defaults is `\\n---\\n`.
+- `pageSeparator?` Optional. A templated page separator to use to split the text. If the results contain `{page_number}` (e.g. JSON mode), it will be replaced by the next page number. If not set, the default separator `\\n---\\n` will be used.
+- `pagePrefix?` Optional. A templated prefix to add to the beginning of each page. If the results contain `{page_number}`, it will be replaced by the page number.
+- `pageSuffix?` Optional. A templated suffix to add to the end of each page. If the results contain `{page_number}`, it will be replaced by the page number.
 - `gpt4oMode` Deprecated. Use vendorMultimodal params. Set to true to use GPT-4o to extract content. Default is `false`.
 - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
 - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.
diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts
index bc9673c71..51e3d2e81 100644
--- a/packages/llamaindex/src/readers/LlamaParseReader.ts
+++ b/packages/llamaindex/src/readers/LlamaParseReader.ts
@@ -127,8 +127,12 @@ export class LlamaParseReader extends FileReader {
   fastMode?: boolean;
   // Wether to keep column in the text according to document layout. Reduce reconstruction accuracy, and LLM's/embedings performances in most cases.
   doNotUnrollColumns?: boolean;
-  // The page separator to use to split the text. Default is None, which means the parser will use the default separator '\\n---\\n'.
+  // A templated page separator to use to split the text. If the results contain `{page_number}` (e.g. JSON mode), it will be replaced by the next page number. If not set, the default separator '\\n---\\n' will be used.
   pageSeparator?: string;
+  // A templated prefix to add to the beginning of each page. If the results contain `{page_number}`, it will be replaced by the page number.
+  pagePrefix?: string;
+  // A templated suffix to add to the end of each page. If the results contain `{page_number}`, it will be replaced by the page number.
+  pageSuffix?: string;
   // Deprecated. Use vendorMultimodal params. Whether to use gpt-4o to extract text from documents.
   gpt4oMode: boolean = false;
   // Deprecated. Use vendorMultimodal params. The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY.
@@ -198,6 +202,8 @@ export class LlamaParseReader extends FileReader {
       fast_mode: this.fastMode?.toString(),
       do_not_unroll_columns: this.doNotUnrollColumns?.toString(),
       page_separator: this.pageSeparator,
+      page_prefix: this.pagePrefix,
+      page_suffix: this.pageSuffix,
       gpt4o_mode: this.gpt4oMode?.toString(),
       gpt4o_api_key: this.gpt4oApiKey,
       bounding_box: this.boundingBox,
@@ -207,8 +213,17 @@ export class LlamaParseReader extends FileReader {
       vendor_multimodal_api_key: this.vendorMultimodalApiKey,
     };
 
+    // Filter out params with invalid values that would cause issues on the backend.
+    const filteredParams = this.filterSpecificParams(LlamaParseBodyParams, [
+      "page_separator",
+      "page_prefix",
+      "page_suffix",
+      "bounding_box",
+      "target_pages",
+    ]);
+
     // Appends body with any defined LlamaParseBodyParams
-    Object.entries(LlamaParseBodyParams).forEach(([key, value]) => {
+    Object.entries(filteredParams).forEach(([key, value]) => {
       if (value !== undefined) {
         body.append(key, value);
       }
@@ -452,6 +467,24 @@ export class LlamaParseReader extends FileReader {
     await fs.writeFile(imagePath, buffer);
   }
 
+  // Returns a copy of params in which the keys listed in keysToCheck are dropped when their value is null, undefined, or an empty string; all other keys are copied unchanged.
+  private filterSpecificParams(
+    params: Record<string, any>,
+    keysToCheck: string[],
+  ): Record<string, any> {
+    const filteredParams: Record<string, any> = {};
+    for (const [key, value] of Object.entries(params)) {
+      if (keysToCheck.includes(key)) {
+        if (value !== null && value !== undefined && value !== "") {
+          filteredParams[key] = value;
+        }
+      } else {
+        filteredParams[key] = value;
+      }
+    }
+    return filteredParams;
+  }
+
   static async getMimeType(
     data: Uint8Array,
   ): Promise<{ mime: string; extension: string }> {
-- 
GitLab