feat: add split by page mode to LlamaParseReader (#924)

345300f1 · Fabian Wimmer · GitHub · f322c5d2 · 345300f1 · 345300f1
Unverified Commit 345300f1 authored 9 months ago by Fabian Wimmer Committed by GitHub 9 months ago
--- a/.changeset/chilled-tomatoes-visit.md
+++ b/.changeset/chilled-tomatoes-visit.md
+---
+"llamaindex": patch
+---
+feat: add splitByPage mode to LlamaParseReader
--- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
+++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
@@ -48,6 +48,7 @@ They can be divided into two groups.
 - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
 - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.
 - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`.
+- `splitByPage` Whether to split the results, creating one document per page. Uses the configured `pageSeparator`, or `\n---\n` as a fallback. Default is `true`.
 - `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`.
 - `vendorMultimodalModel?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost.
 - `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`.

--- a/packages/llamaindex/src/readers/LlamaParseReader.ts
+++ b/packages/llamaindex/src/readers/LlamaParseReader.ts
@@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader {
  targetPages?: string;
  // Whether or not to ignore and skip errors raised during parsing.
  ignoreErrors: boolean = true;
+  // Whether to split by page using the pageSeparator or '\n---\n' as default.
+  splitByPage: boolean = true;
  // Whether to use the vendor multimodal API.
  useVendorMultimodalModel: boolean = false;
  // The model name for the vendor multimodal API
@@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader {
      }
      // Return results as Document objects
-      const resultJson = await this.getJobResult(jobId, this.resultType);
+      const jobResults = await this.getJobResult(jobId, this.resultType);
+      const resultText = jobResults[this.resultType];
+      // Split the text by separator if splitByPage is true
+      if (this.splitByPage) {
+        return this.splitTextBySeparator(resultText);
+      }
      return [
        new Document({
-          text: resultJson[this.resultType],
+          text: resultText,
        }),
      ];
    } catch (e) {
@@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader {
    return filteredParams;
  }
+  private splitTextBySeparator(text: string): Document[] {
+    const separator = this.pageSeparator ?? "\n---\n";
+    const textChunks = text.split(separator);
+    return textChunks.map(
+      (docChunk: string) =>
+        new Document({
+          text: docChunk,
+        }),
+    );
+  }
  static async getMimeType(
    data: Uint8Array,
  ): Promise<{ mime: string; extension: string }> {