diff --git a/examples/data/manga.pdf b/examples/data/manga.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..fdc44d375adcb6e012e7cfe00427d91c50deaa97
Binary files /dev/null and b/examples/data/manga.pdf differ
diff --git a/examples/readers/package.json b/examples/readers/package.json
index 3889180eda301598ba191da628d7d95e32ff4b6b..baf68a0915351313527bd4c5bf80570959737350 100644
--- a/examples/readers/package.json
+++ b/examples/readers/package.json
@@ -10,7 +10,8 @@
     "start:markdown": "node --import tsx ./src/markdown.ts",
     "start:pdf": "node --import tsx ./src/pdf.ts",
     "start:llamaparse": "node --import tsx ./src/llamaparse.ts",
-    "start:notion": "node --import tsx ./src/notion.ts"
+    "start:notion": "node --import tsx ./src/notion.ts",
+    "start:llamaparse2": "node --import tsx ./src/llamaparse_2.ts"
   },
   "dependencies": {
     "llamaindex": "*"
diff --git a/examples/readers/src/llamaparse_2.ts b/examples/readers/src/llamaparse_2.ts
new file mode 100644
index 0000000000000000000000000000000000000000..369079e9bae39f82023e9e45306d763f5be9bc9c
--- /dev/null
+++ b/examples/readers/src/llamaparse_2.ts
@@ -0,0 +1,34 @@
+import fs from "fs/promises";
+import { LlamaParseReader } from "llamaindex";
+
+/**
+ * Parses a manga PDF with LlamaParse and writes the result to a markdown file.
+ *
+ * The manga.pdf in the data folder is just a copy of the TOS, due to copyright
+ * laws, so you have to place your own PDF there. This example was written
+ * against "The Manga Guide to Calculus" by Hiroyuki Kojima.
+ */
+async function main() {
+  // Load PDF using LlamaParse. Set apiKey here or in environment variable LLAMA_CLOUD_API_KEY
+  const reader = new LlamaParseReader({
+    resultType: "markdown",
+    language: "en",
+    parsingInstruction:
+      "The provided document is a manga comic book. Most pages do NOT have title. It does not contain tables. Try to reconstruct the dialogue happening in a cohesive way. Output any math equation in LATEX markdown (between $$)",
+  });
+  const documents = await reader.loadData("../data/manga.pdf");
+
+  // Assuming documents contain an array of pages or sections
+  const parsedManga = documents.map((page) => page.text).join("\n---\n");
+
+  // Output the parsed manga to a .md file. The relative path resolves against the
+  // working directory (examples/readers when started via the package script).
+  try {
+    await fs.writeFile("./parsedManga.md", parsedManga);
+    console.log("Output successfully written to parsedManga.md");
+  } catch (err) {
+    console.error("Error writing to file:", err);
+  }
+}
+
+main().catch(console.error);
diff --git a/packages/core/src/readers/LlamaParseReader.ts b/packages/core/src/readers/LlamaParseReader.ts
index daf5eb1abbe9eb74d3031b603ee2e43de1a301e2..81761174320b4a752b219850bb2a1d8e6eb7740b 100644
--- a/packages/core/src/readers/LlamaParseReader.ts
+++ b/packages/core/src/readers/LlamaParseReader.ts
@@ -1,9 +1,7 @@
 import { defaultFS, getEnv, type GenericFileSystem } from "@llamaindex/env";
 import { filetypemime } from "magic-bytes.js";
 import { Document } from "../Node.js";
-import type { FileReader } from "./type.js";
-
-type ResultType = "text" | "markdown" | "json";
+import type { FileReader, Language, ResultType } from "./type.js";
 
 /**
  * Represents a reader for parsing files using the LlamaParse API.
@@ -20,7 +18,12 @@ export class LlamaParseReader implements FileReader {
   checkInterval = 1;
   // Whether to print the progress of the parsing.
   verbose = true;
+  // The result type for the parser.
   resultType: ResultType = "text";
+  // The language of the text to parse.
+  language: Language = "en";
+  // The parsing instruction for the parser.
+  parsingInstruction = "";
 
   constructor(params: Partial<LlamaParseReader> = {}) {
     Object.assign(this, params);
@@ -48,6 +51,9 @@ export class LlamaParseReader implements FileReader {
     const mimeType = await this.getMimeType(data);
     const body = new FormData();
     body.set("file", new Blob([data], { type: mimeType }), file);
+    // Form field names follow the LlamaParse upload API, which uses snake_case.
+    body.append("language", this.language);
+    body.append("parsing_instruction", this.parsingInstruction);
 
     const headers = {
       Authorization: `Bearer ${this.apiKey}`,
diff --git a/packages/core/src/readers/type.ts b/packages/core/src/readers/type.ts
index a260824d31a2d41054dd1e6f55ab465b54b5d4eb..59128962125ce2aa5f740acf2a68a09d268af703 100644
--- a/packages/core/src/readers/type.ts
+++ b/packages/core/src/readers/type.ts
@@ -14,3 +14,93 @@ export interface BaseReader {
 export interface FileReader extends BaseReader {
   loadData(filePath: string, fs?: CompleteFileSystem): Promise<Document[]>;
 }
+
+// For LlamaParseReader.ts
+
+/** Output formats that can be requested through LlamaParseReader's `resultType`. */
+export type ResultType = "text" | "markdown" | "json";
+/** Language codes accepted by LlamaParseReader's `language` option. */
+export type Language =
+  | "abq"
+  | "ady"
+  | "af"
+  | "ang"
+  | "ar"
+  | "as"
+  | "ava"
+  | "az"
+  | "be"
+  | "bg"
+  | "bh"
+  | "bho"
+  | "bn"
+  | "bs"
+  | "ch_sim"
+  | "ch_tra"
+  | "che"
+  | "cs"
+  | "cy"
+  | "da"
+  | "dar"
+  | "de"
+  | "en"
+  | "es"
+  | "et"
+  | "fa"
+  | "fr"
+  | "ga"
+  | "gom"
+  | "hi"
+  | "hr"
+  | "hu"
+  | "id"
+  | "inh"
+  | "is"
+  | "it"
+  | "ja"
+  | "kbd"
+  | "kn"
+  | "ko"
+  | "ku"
+  | "la"
+  | "lbe"
+  | "lez"
+  | "lt"
+  | "lv"
+  | "mah"
+  | "mai"
+  | "mi"
+  | "mn"
+  | "mr"
+  | "ms"
+  | "mt"
+  | "ne"
+  | "new"
+  | "nl"
+  | "no"
+  | "oc"
+  | "pi"
+  | "pl"
+  | "pt"
+  | "ro"
+  | "ru"
+  | "rs_cyrillic"
+  | "rs_latin"
+  | "sck"
+  | "sk"
+  | "sl"
+  | "sq"
+  | "sv"
+  | "sw"
+  | "ta"
+  | "tab"
+  | "te"
+  | "th"
+  | "tjk"
+  | "tl"
+  | "tr"
+  | "ug"
+  | "uk"
+  | "ur"
+  | "uz"
+  | "vi";