From fb6db454d416d8fdb0eceaaf81ba2952d2f3889e Mon Sep 17 00:00:00 2001 From: Fabian Wimmer <github@insightby.ai> Date: Thu, 25 Jul 2024 10:09:43 +0200 Subject: [PATCH] feat: update/add pageSeparator params to LlamaParseReader (#1051) --- .changeset/weak-news-train.md | 5 +++ .../docs/modules/data_loaders/discord.mdx | 2 +- apps/docs/docs/modules/data_loaders/index.mdx | 10 ++++- .../data_loaders/llama_parse/index.mdx | 4 +- .../src/readers/LlamaParseReader.ts | 37 ++++++++++++++++++- 5 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 .changeset/weak-news-train.md diff --git a/.changeset/weak-news-train.md b/.changeset/weak-news-train.md new file mode 100644 index 000000000..f8a8794db --- /dev/null +++ b/.changeset/weak-news-train.md @@ -0,0 +1,5 @@ +--- +"llamaindex": patch +--- + +feat: add pageSeparator params to LlamaParseReader diff --git a/apps/docs/docs/modules/data_loaders/discord.mdx b/apps/docs/docs/modules/data_loaders/discord.mdx index 756123532..ffb4c312e 100644 --- a/apps/docs/docs/modules/data_loaders/discord.mdx +++ b/apps/docs/docs/modules/data_loaders/discord.mdx @@ -20,7 +20,7 @@ Copy the URL in your browser and select the server you want your bot to join. #### DiscordReader() - `discordToken?`: The Discord bot token. -- `makeRequest?`: Optionally provide a custom request function for edge environments, e.g. `fetch`. See discord.js for more info. +- `requestHandler?`: Optionally provide a custom request function for edge environments, e.g. `fetch`. See discord.js for more info. 
#### DiscordReader.loadData diff --git a/apps/docs/docs/modules/data_loaders/index.mdx b/apps/docs/docs/modules/data_loaders/index.mdx index 710515ebd..2fbf9bc8e 100644 --- a/apps/docs/docs/modules/data_loaders/index.mdx +++ b/apps/docs/docs/modules/data_loaders/index.mdx @@ -16,7 +16,15 @@ It is a simple reader that reads all files from a directory and its subdirectori <CodeBlock language="ts">{CodeSource}</CodeBlock> -Currently, it supports reading `.txt`, `.pdf`, `.csv`, `.md`, `.docx`, `.htm`, `.html`, `.jpg`, `.jpeg`, `.png` and `.gif` files, but support for other file types is planned. +Currently, the following readers are mapped to specific file types: + +- [TextFileReader](../../api/classes/TextFileReader.md): `.txt` +- [PDFReader](../../api/classes/PDFReader.md): `.pdf` +- [PapaCSVReader](../../api/classes/PapaCSVReader.md): `.csv` +- [MarkdownReader](../../api/classes/MarkdownReader.md): `.md` +- [DocxReader](../../api/classes/DocxReader.md): `.docx` +- [HTMLReader](../../api/classes/HTMLReader.md): `.htm`, `.html` +- [ImageReader](../../api/classes/ImageReader.md): `.jpg`, `.jpeg`, `.png`, `.gif` You can modify the reader three different ways: diff --git a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx index a6f6c2750..0efff3713 100644 --- a/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx +++ b/apps/docs/docs/modules/data_loaders/llama_parse/index.mdx @@ -41,7 +41,9 @@ They can be divided into two groups. - `doNotCache?` Optional. Set to true to not cache the document. - `fastMode?` Optional. Set to true to use the fast mode. This mode will skip OCR of images, and table/heading reconstruction. Note: Non-compatible with `gpt4oMode`. - `doNotUnrollColumns?` Optional. Set to true to keep the text according to document layout. Reduce reconstruction accuracy, and LLMs/embeddings performances in most cases. -- `pageSeparator?` Optional. The page separator to use. 
Defaults is `\\n---\\n`. +- `pageSeparator?` Optional. A templated page separator to use to split the text. If the results contain `{page_number}` (e.g. JSON mode), it will be replaced by the next page number. If not set, the default separator `\\n---\\n` will be used. +- `pagePrefix?` Optional. A templated prefix to add to the beginning of each page. If the results contain `{page_number}`, it will be replaced by the page number. +- `pageSuffix?` Optional. A templated suffix to add to the end of each page. If the results contain `{page_number}`, it will be replaced by the page number. - `gpt4oMode` Deprecated. Use vendorMultimodal params. Set to true to use GPT-4o to extract content. Default is `false`. - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`. - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document. diff --git a/packages/llamaindex/src/readers/LlamaParseReader.ts b/packages/llamaindex/src/readers/LlamaParseReader.ts index bc9673c71..51e3d2e81 100644 --- a/packages/llamaindex/src/readers/LlamaParseReader.ts +++ b/packages/llamaindex/src/readers/LlamaParseReader.ts @@ -127,8 +127,12 @@ export class LlamaParseReader extends FileReader { fastMode?: boolean; // Wether to keep column in the text according to document layout. Reduce reconstruction accuracy, and LLM's/embedings performances in most cases. doNotUnrollColumns?: boolean; - // The page separator to use to split the text. Default is None, which means the parser will use the default separator '\\n---\\n'. + // A templated page separator to use to split the text. If the results contain `{page_number}` (e.g. 
JSON mode), it will be replaced by the next page number. If not set, the default separator '\\n---\\n' will be used. pageSeparator?: string; + // A templated prefix to add to the beginning of each page. If the results contain `{page_number}`, it will be replaced by the page number. + pagePrefix?: string; + // A templated suffix to add to the end of each page. If the results contain `{page_number}`, it will be replaced by the page number. + pageSuffix?: string; // Deprecated. Use vendorMultimodal params. Whether to use gpt-4o to extract text from documents. gpt4oMode: boolean = false; // Deprecated. Use vendorMultimodal params. The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY. @@ -198,6 +202,8 @@ export class LlamaParseReader extends FileReader { fast_mode: this.fastMode?.toString(), do_not_unroll_columns: this.doNotUnrollColumns?.toString(), page_separator: this.pageSeparator, + page_prefix: this.pagePrefix, + page_suffix: this.pageSuffix, gpt4o_mode: this.gpt4oMode?.toString(), gpt4o_api_key: this.gpt4oApiKey, bounding_box: this.boundingBox, @@ -207,8 +213,17 @@ export class LlamaParseReader extends FileReader { vendor_multimodal_api_key: this.vendorMultimodalApiKey, }; + // Filter out params with invalid values that would cause issues on the backend. + const filteredParams = this.filterSpecificParams(LlamaParseBodyParams, [ + "page_separator", + "page_prefix", + "page_suffix", + "bounding_box", + "target_pages", + ]); + // Appends body with any defined LlamaParseBodyParams - Object.entries(LlamaParseBodyParams).forEach(([key, value]) => { + Object.entries(filteredParams).forEach(([key, value]) => { if (value !== undefined) { body.append(key, value); } @@ -452,6 +467,24 @@ export class LlamaParseReader extends FileReader { await fs.writeFile(imagePath, buffer); } + // Filters out invalid values (null, undefined, empty string) of specific params. 
+ private filterSpecificParams( + params: Record<string, any>, + keysToCheck: string[], + ): Record<string, any> { + const filteredParams: Record<string, any> = {}; + for (const [key, value] of Object.entries(params)) { + if (keysToCheck.includes(key)) { + if (value !== null && value !== undefined && value !== "") { + filteredParams[key] = value; + } + } else { + filteredParams[key] = value; + } + } + return filteredParams; + } + static async getMimeType( data: Uint8Array, ): Promise<{ mime: string; extension: string }> { -- GitLab