docs: update data loader documentation (#900)

Co-authored-by: Alex Yang <himself65@outlook.com>

docs: update data loader documentation (#900)
174cb3e6 · Fabian Wimmer · GitHub · 5ab5e519 · 174cb3e6 · 174cb3e6
Unverified Commit 174cb3e6 authored 10 months ago by Fabian Wimmer Committed by GitHub 10 months ago
--- a/apps/docs/docs/modules/data_loader.mdx
+++ b/apps/docs/docs/modules/data_loader.mdx
@@ -6,6 +6,7 @@ import CodeBlock from "@theme/CodeBlock";
 import CodeSource from "!raw-loader!../../../../examples/readers/src/simple-directory-reader";
 import CodeSource2 from "!raw-loader!../../../../examples/readers/src/custom-simple-directory-reader";
 import CodeSource3 from "!raw-loader!../../../../examples/readers/src/llamaparse";
+import CodeSource4 from "!raw-loader!../../../../examples/readers/src/simple-directory-reader-with-llamaparse.ts";
 # Loader
@@ -21,11 +22,13 @@ It is a simple reader that reads all files from a directory and its subdirectori
 <CodeBlock language="ts">{CodeSource}</CodeBlock>
-Currently, it supports reading `.csv`, `.docx`, `.html`, `.md` and `.pdf` files,
+Currently, it supports reading `.txt`, `.pdf`, `.csv`, `.md`, `.docx`, `.htm`, `.html`, `.jpg`, `.jpeg`, `.png` and `.gif` files, but support for other file types is planned.
-but support for other file types is planned.
-Also, you can provide a `defaultReader` as a fallback for files with unsupported extensions.
+You can override the default reader for all file types, including unsupported ones, with the `overrideReader` option.
-Or pass new readers for `fileExtToReader` to support more file types.
+Additionally, you can override the default reader for specific file types or add support for additional file types with the `fileExtToReader` option.
+Also, you can provide a `defaultReader` as a fallback for files with unsupported extensions. By default it is `TextFileReader`.
+SimpleDirectoryReader supports up to 9 concurrent requests. Use the `numWorkers` option to set the number of concurrent requests. By default, `numWorkers` is set to 1, i.e. files are read sequentially.
 <CodeBlock language="ts" showLineNumbers metastring="{8-12,17-21}">
  {CodeSource2}
@@ -35,14 +38,31 @@ Or pass new readers for `fileExtToReader` to support more file types.
 LlamaParse is an API created by LlamaIndex to efficiently parse files, e.g. it's great at converting PDF tables into markdown.
-To use it, first login and get an API key from https://cloud.llamaindex.ai. Make sure to store the key in the environment variable `LLAMA_CLOUD_API_KEY`.
+To use it, first log in and get an API key from https://cloud.llamaindex.ai. Make sure to pass the key as the `apiKey` parameter or store it in the environment variable `LLAMA_CLOUD_API_KEY`.
-Then, you can use the `LlamaParseReader` class to read a local PDF file and convert it into a markdown document that can be used by LlamaIndex:
+Then, you can use the `LlamaParseReader` class to read local files and convert them into a parsed document that can be used by LlamaIndex.
+See [LlamaParseReader.ts](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/readers/LlamaParseReader.ts#L6) for a list of supported file types:
 <CodeBlock language="ts">{CodeSource3}</CodeBlock>
-Alternatively, you can set the [`resultType`](../api/classes/LlamaParseReader.md#resulttype) option to `text` to get the parsed document as a text string.
+Additional options can be set with the `LlamaParseReader` constructor:
+- `resultType` can be set to `markdown`, `text` or `json`. Defaults to `text`.
+- `language` primarily helps with OCR. Defaults to `en`. See [../readers/type.ts](https://github.com/run-llama/LlamaIndexTS/blob/main/packages/core/src/readers/type.ts#L20) for a list of supported languages.
+- `parsingInstruction` can help with complicated document structures. See this [LlamaIndex Blog Post](https://www.llamaindex.ai/blog/launching-the-first-genai-native-document-parsing-platform) for an example.
+- `skipDiagonalText` set to true to ignore diagonal text.
+- `invalidateCache` set to true to ignore the LlamaCloud cache. All documents are kept in the cache for 48 hours after the job was completed to avoid processing the same document twice. Can be useful for testing when trying to re-parse the same document with, e.g., a different `parsingInstruction`.
+- `gpt4oMode` set to true to use GPT-4o to extract content.
+- `gpt4oApiKey` set the GPT-4o API key. Optional. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
+- `numWorkers` as in the python version, is set in `SimpleDirectoryReader`. Default is 1.
+## LlamaParse with SimpleDirectoryReader
+Below is a full example of `LlamaParse` integrated in `SimpleDirectoryReader` with additional options.
+<CodeBlock language="ts">{CodeSource4}</CodeBlock>
 ## API Reference
 - [SimpleDirectoryReader](../api/classes/SimpleDirectoryReader.md)
+- [LlamaParseReader](../api/classes/LlamaParseReader.md)
--- a/packages/core/src/readers/LlamaParseReader.ts
+++ b/packages/core/src/readers/LlamaParseReader.ts
@@ -106,7 +106,7 @@ const SupportedFiles: { [key: string]: string } = {
 * See https://github.com/run-llama/llama_parse
 */
 export class LlamaParseReader implements FileReader {
-  // The API key for the LlamaParse API.
+  // The API key for the LlamaParse API. Can be set as an environment variable: LLAMA_CLOUD_API_KEY
  apiKey: string;
  // The base URL of the Llama Parsing API.
  baseUrl: string = "https://api.cloud.llamaindex.ai/api/parsing";
@@ -124,11 +124,11 @@ export class LlamaParseReader implements FileReader {
  parsingInstruction: string = "";
  // If set to true, the parser will ignore diagonal text (when the text rotation in degrees modulo 90 is not 0).
  skipDiagonalText: boolean = false;
-  // If set to true, the cache will be ignored and the document re-processes. All document are kept in cache for 48hours after the job was completed to avoid processing 2 time the same document.
+  // If set to true, the cache will be ignored and the document re-processed. All documents are kept in the cache for 48 hours after the job was completed to avoid processing the same document twice.
  invalidateCache: boolean = false;
-  // Whether to use gpt-4o extract text from documents.
+  // Whether to use gpt-4o to extract text from documents.
  gpt4oMode: boolean = false;
-  // The API key for the GPT-4o API. Lowers the cost of parsing.
+  // The API key for the GPT-4o API. Optional, lowers the cost of parsing. Can be set as an env variable: LLAMA_CLOUD_GPT4O_API_KEY.
  gpt4oApiKey?: string;
  // numWorkers is implemented in SimpleDirectoryReader