diff --git a/apps/docs/docs/modules/data_loader.mdx b/apps/docs/docs/modules/data_loader.mdx index 2bf5b85583efca916f878301e6eb15ebf84f65e3..7445e55329b8068167e88a3e8a0fa48f88912306 100644 --- a/apps/docs/docs/modules/data_loader.mdx +++ b/apps/docs/docs/modules/data_loader.mdx @@ -6,6 +6,7 @@ import CodeBlock from "@theme/CodeBlock"; import CodeSource from "!raw-loader!../../../../examples/readers/src/simple-directory-reader"; import CodeSource2 from "!raw-loader!../../../../examples/readers/src/custom-simple-directory-reader"; import CodeSource3 from "!raw-loader!../../../../examples/readers/src/llamaparse"; +import CodeSource4 from "!raw-loader!../../../../examples/readers/src/simple-directory-reader-with-llamaparse.ts"; # Loader @@ -21,11 +22,13 @@ It is a simple reader that reads all files from a directory and its subdirectori <CodeBlock language="ts">{CodeSource}</CodeBlock> -Currently, it supports reading `.csv`, `.docx`, `.html`, `.md` and `.pdf` files, -but support for other file types is planned. +Currently, it supports reading `.txt`, `.pdf`, `.csv`, `.md`, `.docx`, `.htm`, `.html`, `.jpg`, `.jpeg`, `.png` and `.gif` files, but support for other file types is planned. -Also, you can provide a `defaultReader` as a fallback for files with unsupported extensions. -Or pass new readers for `fileExtToReader` to support more file types. +You can override the default reader for all file types, including unsupported ones, with the `overrideReader` option. +Additionally, you can override the default reader for specific file types or add support for additional file types with the `fileExtToReader` option. +Also, you can provide a `defaultReader` as a fallback for files with unsupported extensions. By default it is `TextFileReader`. + +SimpleDirectoryReader supports up to 9 concurrent requests. Use the `numWorkers` option to set the number of concurrent requests. By default it runs in sequential mode, i.e. `numWorkers` is set to 1. 
<CodeBlock language="ts" showLineNumbers metastring="{8-12,17-21}"> {CodeSource2} @@ -35,14 +38,31 @@ Or pass new readers for `fileExtToReader` to support more file types. LlamaParse is an API created by LlamaIndex to efficiently parse files, e.g. it's great at converting PDF tables into markdown. -To use it, first login and get an API key from https://cloud.llamaindex.ai. Make sure to store the key in the environment variable `LLAMA_CLOUD_API_KEY`. +To use it, first log in and get an API key from https://cloud.llamaindex.ai. Make sure to pass the key as the `apiKey` parameter or store it in the environment variable `LLAMA_CLOUD_API_KEY`. -Then, you can use the `LlamaParseReader` class to read a local PDF file and convert it into a markdown document that can be used by LlamaIndex: +Then, you can use the `LlamaParseReader` class to read local files and convert them into a parsed document that can be used by LlamaIndex. +See [LlamaParseReader.ts](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/readers/LlamaParseReader.ts#L6) for a list of supported file types: <CodeBlock language="ts">{CodeSource3}</CodeBlock> -Alternatively, you can set the [`resultType`](../api/classes/LlamaParseReader.md#resulttype) option to `text` to get the parsed document as a text string. +Additional options can be set with the `LlamaParseReader` constructor: + +- `resultType` can be set to `markdown`, `text` or `json`. Defaults to `text`. +- `language` primarily helps with OCR recognition. Defaults to `en`. See [../readers/type.ts](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/readers/type.ts#L20) for a list of supported languages. +- `parsingInstruction` can help with complicated document structures. See this [LlamaIndex Blog Post](https://www.llamaindex.ai/blog/launching-the-first-genai-native-document-parsing-platform) for an example. +- `skipDiagonalText` set to true to ignore diagonal text. +- `invalidateCache` set to true to ignore the LlamaCloud cache. 
All documents are kept in cache for 48 hours after the job was completed to avoid processing the same document twice. Can be useful for testing when trying to re-parse the same document with, e.g., a different `parsingInstruction`. +- `gpt4oMode` set to true to use GPT-4o to extract content. +- `gpt4oApiKey` set the GPT-4o API key. Optional. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`. +- `numWorkers`, as in the Python version, is set in `SimpleDirectoryReader`. Default is 1. + +## LlamaParse with SimpleDirectoryReader + +Below is a full example of `LlamaParse` integrated in `SimpleDirectoryReader` with additional options. + +<CodeBlock language="ts">{CodeSource4}</CodeBlock> ## API Reference - [SimpleDirectoryReader](../api/classes/SimpleDirectoryReader.md) +- [LlamaParseReader](../api/classes/LlamaParseReader.md) diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts index 2fc6dbcefadb80190881c4884731252a8078d8ca..23ca173205fecdd4c42e37a5a559929bc914ca04 100644 --- a/packages/core/src/readers/LlamaParseReader.ts +++ b/packages/core/src/readers/LlamaParseReader.ts @@ -106,7 +106,7 @@ const SupportedFiles: { [key: string]: string } = { * See https://github.com/run-llama/llama_parse */ export class LlamaParseReader implements FileReader { - // The API key for the LlamaParse API. + // The API key for the LlamaParse API. Can be set as an environment variable: LLAMA_CLOUD_API_KEY apiKey: string; // The base URL of the Llama Parsing API. baseUrl: string = "https://api.cloud.llamaindex.ai/api/parsing"; @@ -124,11 +124,11 @@ export class LlamaParseReader implements FileReader { parsingInstruction: string = ""; // If set to true, the parser will ignore diagonal text (when the text rotation in degrees modulo 90 is not 0). 
skipDiagonalText: boolean = false; - // If set to true, the cache will be ignored and the document re-processes. All document are kept in cache for 48hours after the job was completed to avoid processing 2 time the same document. + // If set to true, the cache will be ignored and the document re-processed. All documents are kept in cache for 48 hours after the job was completed to avoid processing the same document twice. invalidateCache: boolean = false; - // Whether to use gpt-4o extract text from documents. + // Whether to use gpt-4o to extract text from documents. gpt4oMode: boolean = false; - // The API key for the GPT-4o API. Lowers the cost of parsing. + // The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY. gpt4oApiKey?: string; // numWorkers is implemented in SimpleDirectoryReader