From 1a65ead849d862f0d7e01f64b46cbc0fa7a763da Mon Sep 17 00:00:00 2001
From: Fabian Wimmer <github@insightby.ai>
Date: Tue, 16 Jul 2024 23:20:34 +0200
Subject: [PATCH] feat: add vendorMultiModal params to LlamaParseReader (#1042)

---
 .changeset/brown-zoos-nail.md                 |  5 +++++
 .../data_loaders/llama_parse/index.mdx        |  7 +++++--
 .../src/readers/LlamaParseReader.ts           | 20 +++++++++++++++++--
 3 files changed, 28 insertions(+), 4 deletions(-)
 create mode 100644 .changeset/brown-zoos-nail.md

diff --git a/.changeset/brown-zoos-nail.md b/.changeset/brown-zoos-nail.md
new file mode 100644
index 000000000..e5ec1e61f
--- /dev/null
+++ b/.changeset/brown-zoos-nail.md
@@ -0,0 +1,5 @@
+---
+"llamaindex": patch
+---
+
+feat: add vendorMultimodal params to LlamaParseReader
diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
index dbb1fcee4..a6f6c2750 100644
--- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
+++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx
@@ -42,10 +42,13 @@ They can be divided into two groups.
 - `fastMode?` Optional. Set to true to use the fast mode. This mode will skip OCR of images, and table/heading reconstruction. Note: Non-compatible with `gpt4oMode`.
 - `doNotUnrollColumns?` Optional. Set to true to keep the text according to document layout. Reduce reconstruction accuracy, and LLMs/embeddings performances in most cases.
 - `pageSeparator?` Optional. The page separator to use. Defaults is `\\n---\\n`.
-- `gpt4oMode` set to true to use GPT-4o to extract content. Default is `false`.
-- `gpt4oApiKey?` Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
+- `gpt4oMode` Deprecated. Use `useVendorMultimodalModel` instead. Set to true to use GPT-4o to extract content. Default is `false`.
+- `gpt4oApiKey?` Deprecated. Use `vendorMultimodalApiKey` instead. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
 - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.
 - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`.
+- `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`.
+- `vendorMultimodalModelName?` Optional. Specify which multimodal model to use. Default is GPT-4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost.
+- `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`.
 - `numWorkers` as in the python version, is set in `SimpleDirectoryReader`. Default is 1.
 
 ### LlamaParse with SimpleDirectoryReader
diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts
index 5acaa4431..061211295 100644
--- a/packages/llamaindex/src/readers/LlamaParseReader.ts
+++ b/packages/llamaindex/src/readers/LlamaParseReader.ts
@@ -129,9 +129,9 @@ export class LlamaParseReader extends FileReader {
   doNotUnrollColumns?: boolean;
   // The page separator to use to split the text. Default is None, which means the parser will use the default separator '\\n---\\n'.
   pageSeparator?: string;
-  // Whether to use gpt-4o to extract text from documents.
+  // Deprecated. Use vendorMultimodal params. Whether to use gpt-4o to extract text from documents.
   gpt4oMode: boolean = false;
-  // The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY.
+  // Deprecated. Use vendorMultimodal params. The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY.
   gpt4oApiKey?: string;
   // The bounding box to use to extract text from documents. Describe as a string containing the bounding box margins.
   boundingBox?: string;
@@ -139,6 +139,12 @@ export class LlamaParseReader extends FileReader {
   targetPages?: string;
   // Whether or not to ignore and skip errors raised during parsing.
   ignoreErrors: boolean = true;
+  // Whether to use the vendor multimodal API.
+  useVendorMultimodalModel: boolean = false;
+  // The model name for the vendor multimodal API
+  vendorMultimodalModelName?: string;
+  // The API key for the multimodal API. Can also be set as an env variable: LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY
+  vendorMultimodalApiKey?: string;
   // numWorkers is implemented in SimpleDirectoryReader
 
   constructor(params: Partial<LlamaParseReader> = {}) {
@@ -158,6 +164,13 @@ export class LlamaParseReader extends FileReader {
 
       this.gpt4oApiKey = params.gpt4oApiKey;
     }
+    if (params.useVendorMultimodalModel) {
+      params.vendorMultimodalApiKey =
+        params.vendorMultimodalApiKey ??
+        getEnv("LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY");
+
+      this.vendorMultimodalApiKey = params.vendorMultimodalApiKey;
+    }
   }
 
   // Create a job for the LlamaParse API
@@ -189,6 +202,9 @@ export class LlamaParseReader extends FileReader {
       gpt4o_api_key: this.gpt4oApiKey,
       bounding_box: this.boundingBox,
       target_pages: this.targetPages,
+      use_vendor_multimodal_model: this.useVendorMultimodalModel?.toString(),
+      vendor_multimodal_model_name: this.vendorMultimodalModelName,
+      vendor_multimodal_api_key: this.vendorMultimodalApiKey,
     };
 
     // Appends body with any defined LlamaParseBodyParams
-- 
GitLab