Skip to content
Snippets Groups Projects
Unverified Commit 345300f1 authored by Fabian Wimmer's avatar Fabian Wimmer Committed by GitHub
Browse files

feat: add split by page mode to LlamaParseReader (#924)

parent f322c5d2
No related branches found
No related tags found
No related merge requests found
---
"llamaindex": patch
---
feat: add splitByPage mode to LlamaParseReader
...@@ -48,6 +48,7 @@ They can be divided into two groups. ...@@ -48,6 +48,7 @@ They can be divided into two groups.
- `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`. - `gpt4oApiKey?` Deprecated. Use vendorMultimodal params. Optional. Set the GPT-4o API key. Lowers the cost of parsing by using your own API key. Your OpenAI account will be charged. Can also be set in the environment variable `LLAMA_CLOUD_GPT4O_API_KEY`.
- `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document. - `boundingBox?` Optional. Specify an area of the document to parse. Expects the bounding box margins as a string in clockwise order, e.g. `boundingBox = "0.1,0,0,0"` to not parse the top 10% of the document.
- `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`. - `targetPages?` Optional. Specify which pages to parse by specifying them as a comma-separated list. First page is `0`.
- `splitByPage` Whether to split the results, creating one document per page. Uses the set `pageSeparator` or `\n---\n` as fallback. Default is `true`.
- `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`. - `useVendorMultimodalModel` set to true to use a multimodal model. Default is `false`.
- `vendorMultimodalModel?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost. - `vendorMultimodalModel?` Optional. Specify which multimodal model to use. Default is GPT4o. See [here](https://docs.cloud.llamaindex.ai/llamaparse/features/multimodal) for a list of available models and cost.
- `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`. - `vendorMultimodalApiKey?` Optional. Set the multimodal model API key. Can also be set in the environment variable `LLAMA_CLOUD_VENDOR_MULTIMODAL_API_KEY`.
......
...@@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader { ...@@ -143,6 +143,8 @@ export class LlamaParseReader extends FileReader {
targetPages?: string; targetPages?: string;
// Whether or not to ignore and skip errors raised during parsing. // Whether or not to ignore and skip errors raised during parsing.
ignoreErrors: boolean = true; ignoreErrors: boolean = true;
// Whether to split by page using the pageSeparator or '\n---\n' as default.
splitByPage: boolean = true;
// Whether to use the vendor multimodal API. // Whether to use the vendor multimodal API.
useVendorMultimodalModel: boolean = false; useVendorMultimodalModel: boolean = false;
// The model name for the vendor multimodal API // The model name for the vendor multimodal API
...@@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader { ...@@ -326,10 +328,17 @@ export class LlamaParseReader extends FileReader {
} }
// Return results as Document objects // Return results as Document objects
const resultJson = await this.getJobResult(jobId, this.resultType); const jobResults = await this.getJobResult(jobId, this.resultType);
const resultText = jobResults[this.resultType];
// Split the text by separator if splitByPage is true
if (this.splitByPage) {
return this.splitTextBySeparator(resultText);
}
return [ return [
new Document({ new Document({
text: resultJson[this.resultType], text: resultText,
}), }),
]; ];
} catch (e) { } catch (e) {
...@@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader { ...@@ -485,6 +494,17 @@ export class LlamaParseReader extends FileReader {
return filteredParams; return filteredParams;
} }
/**
 * Splits the parsed result text into one Document per chunk.
 *
 * The text is cut on `this.pageSeparator`. When no separator is configured,
 * or the configured separator is an empty string, "\n---\n" is used instead.
 *
 * @param text - The full result text to split.
 * @returns One Document per chunk, in the order the chunks appear in `text`.
 */
private splitTextBySeparator(text: string): Document[] {
  // `||` instead of `??`: an empty-string separator would make `split`
  // return one chunk per UTF-16 code unit (one Document per character,
  // breaking surrogate pairs), so it is treated the same as "not set".
  const separator = this.pageSeparator || "\n---\n";
  return text.split(separator).map((chunk) => new Document({ text: chunk }));
}
static async getMimeType( static async getMimeType(
data: Uint8Array, data: Uint8Array,
): Promise<{ mime: string; extension: string }> { ): Promise<{ mime: string; extension: string }> {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment