From 1a65ead849d862f0d7e01f64b46cbc0fa7a763da Mon Sep 17 00:00:00 2001 From: Fabian Wimmer <github@insightby.ai> Date: Tue, 16 Jul 2024 23:20:34 +0200 Subject: [PATCH] feat: add vendorMultiModal params to LlamaParseReader (#1042) --- .changeset/brown-zoos-nail.md | 5 +++++ .../data_loaders/llama_parse/index.mdx | 7 +++++-- .../src/readers/LlamaParseReader.ts | 20 +++++++++++++++++-- 3 files changed, 28 insertions(+), 4 deletions(-) create mode 100644 .changeset/brown-zoos-nail.md diff --git a/.changeset/brown-zoos-nail.md b/.changeset/brown-zoos-nail.md new file mode 100644 index 000000000..e5ec1e61f --- /dev/null +++ b/.changeset/brown-zoos-nail.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +feat: add vendorMultimodal params to LlamaParseReader diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx index dbb1fcee4..a6f6c2750 100644 --- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx +++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx @@ -42,10 +42,13 @@ They can be divided into two groups. - `fastMode?` Optional. Set to true to use the fast mode. This mode will skip OCR of images, and table/heading reconstruction. Note: Non-compatible with `gpt4oMode`. - `doNotUnrollColumns?` Optional. Set to true to keep the text according to document layout. Reduce reconstruction accuracy, and LLMs/embeddings performances in most cases. - `pageSeparator?` Optional. The page separator to use. Defaults is `\\n---\\n`. -- `gpt4oMode` set to true to use GPT-4o to extract content. Default is `false`. -- `gpt4oApiKey?` Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`. +- `gpt4oMode` Deprecated. Use vendorMultimodal params. Set to true to use GPT-4o to extract content. Default is `false`. +- `gpt4oApiKey?` Deprecated. 
Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`. - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document. - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`. +- `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`. +- `vendorMultimodalModelName?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost. +- `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`. - `numWorkers` as in the python version, is set in `SimpleDirectoryReader`. Default is 1. ### LlamaParse with SimpleDirectoryReader diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts index 5acaa4431..061211295 100644 --- a/packages/llamaindex/src/readers/LlamaParseReader.ts +++ b/packages/llamaindex/src/readers/LlamaParseReader.ts @@ -129,9 +129,9 @@ export class LlamaParseReader extends FileReader { doNotUnrollColumns?: boolean; // The page separator to use to split the text. Default is None, which means the parser will use the default separator '\\n---\\n'. pageSeparator?: string; - // Whether to use gpt-4o to extract text from documents. + // Deprecated. Use vendorMultimodal params. Whether to use gpt-4o to extract text from documents. gpt4oMode: boolean = false; - // The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY. 
+ // Deprecated. Use vendorMultimodal params. The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY. gpt4oApiKey?: string; // The bounding box to use to extract text from documents. Describe as a string containing the bounding box margins. boundingBox?: string; @@ -139,6 +139,12 @@ export class LlamaParseReader extends FileReader { targetPages?: string; // Whether or not to ignore and skip errors raised during parsing. ignoreErrors: boolean = true; + // Whether to use the vendor multimodal API. + useVendorMultimodalModel: boolean = false; + // The model name for the vendor multimodal API + vendorMultimodalModelName?: string; + // The API key for the multimodal API. Can also be set as an env variable: LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY + vendorMultimodalApiKey?: string; // numWorkers is implemented in SimpleDirectoryReader constructor(params: Partial<LlamaParseReader> = {}) { @@ -158,6 +164,13 @@ export class LlamaParseReader extends FileReader { this.gpt4oApiKey = params.gpt4oApiKey; } + if (params.useVendorMultimodalModel) { + params.vendorMultimodalApiKey = + params.vendorMultimodalApiKey ?? + getEnv("LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY"); + + this.vendorMultimodalApiKey = params.vendorMultimodalApiKey; + } } // Create a job for the LlamaParse API @@ -189,6 +202,9 @@ export class LlamaParseReader extends FileReader { gpt4o_api_key: this.gpt4oApiKey, bounding_box: this.boundingBox, target_pages: this.targetPages, + use_vendor_multimodal_model: this.useVendorMultimodalModel?.toString(), + vendor_multimodal_model_name: this.vendorMultimodalModelName, + vendor_multimodal_api_key: this.vendorMultimodalApiKey, }; // Appends body with any defined LlamaParseBodyParams -- GitLab