Skip to content
Snippets Groups Projects
Unverified Commit a1a72ab2 authored by Fabian Wimmer's avatar Fabian Wimmer Committed by GitHub
Browse files

feat: LlamaParseReader: update Supported File Types to match python version (#823)

parent b99ab056
Branches
Tags
No related merge requests found
...@@ -3,6 +3,49 @@ import { filetypemime } from "magic-bytes.js"; ...@@ -3,6 +3,49 @@ import { filetypemime } from "magic-bytes.js";
import { Document } from "../Node.js"; import { Document } from "../Node.js";
import type { FileReader, Language, ResultType } from "./type.js"; import type { FileReader, Language, ResultType } from "./type.js";
/**
 * Lookup table from file extension (lower-case, including the leading dot)
 * to the MIME type associated with that extension.
 *
 * Insertion order is significant: the keys are joined, in this order, to
 * build the list of supported formats shown in error messages.
 */
const SupportedFiles: Record<string, string> = {
  // Portable Document Format
  ".pdf": "application/pdf",

  // Microsoft Word documents and templates
  ".doc": "application/msword",
  ".docx":
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
  ".docm": "application/vnd.ms-word.document.macroEnabled.12",
  ".dot": "application/msword",
  ".dotx":
    "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
  ".dotm": "application/vnd.ms-word.template.macroEnabled.12",

  // Other word-processor formats
  ".rtf": "application/rtf",
  ".wps": "application/vnd.ms-works",
  ".wpd": "application/wordperfect",
  ".sxw": "application/vnd.sun.xml.writer",
  ".stw": "application/vnd.sun.xml.writer.template",
  ".sxg": "application/vnd.sun.xml.writer.global",
  ".pages": "application/x-iwork-pages-sffpages",
  ".mw": "application/macwriteii",
  ".mcw": "application/macwriteii",

  // Uniform Office Format family
  ".uot": "application/x-uo",
  ".uof": "application/vnd.uoml+xml",
  ".uos": "application/vnd.sun.xml.calc",
  ".uop": "application/vnd.openofficeorg.presentation",

  // Microsoft PowerPoint presentations and templates
  ".ppt": "application/vnd.ms-powerpoint",
  ".pptx":
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
  ".pot": "application/vnd.ms-powerpoint",
  ".pptm": "application/vnd.ms-powerpoint.presentation.macroEnabled.12",
  ".potx":
    "application/vnd.openxmlformats-officedocument.presentationml.template",
  ".potm": "application/vnd.ms-powerpoint.template.macroEnabled.12",

  // Other presentation and drawing formats
  ".key": "application/x-iwork-keynote-sffkey",
  ".odp": "application/vnd.oasis.opendocument.presentation",
  ".odg": "application/vnd.oasis.opendocument.graphics",
  ".otp": "application/vnd.oasis.opendocument.presentation-template",
  ".fopd": "application/vnd.oasis.opendocument.presentation",
  ".sxi": "application/vnd.sun.xml.impress",
  ".sti": "application/vnd.sun.xml.impress.template",

  // E-books and web pages
  ".epub": "application/epub+zip",
  ".html": "text/html",
  ".htm": "text/html",
};
/** /**
* Represents a reader for parsing files using the LlamaParse API. * Represents a reader for parsing files using the LlamaParse API.
* See https://github.com/run-llama/llama_parse * See https://github.com/run-llama/llama_parse
...@@ -40,15 +83,12 @@ export class LlamaParseReader implements FileReader { ...@@ -40,15 +83,12 @@ export class LlamaParseReader implements FileReader {
file: string, file: string,
fs: GenericFileSystem = defaultFS, fs: GenericFileSystem = defaultFS,
): Promise<Document[]> { ): Promise<Document[]> {
if (!file.endsWith(".pdf")) {
throw new Error("Currently, only PDF files are supported.");
}
const metadata = { file_path: file }; const metadata = { file_path: file };
// Load data, set the mime type // Load data, set the mime type
const data = await fs.readRawFile(file); const data = await fs.readRawFile(file);
const mimeType = await this.getMimeType(data); const mimeType = await this.getMimeType(data);
const body = new FormData(); const body = new FormData();
body.set("file", new Blob([data], { type: mimeType }), file); body.set("file", new Blob([data], { type: mimeType }), file);
body.append("language", this.language); body.append("language", this.language);
...@@ -67,7 +107,7 @@ export class LlamaParseReader implements FileReader { ...@@ -67,7 +107,7 @@ export class LlamaParseReader implements FileReader {
headers, headers,
}); });
if (!response.ok) { if (!response.ok) {
throw new Error(`Failed to parse the PDF file: ${await response.text()}`); throw new Error(`Failed to parse the file: ${await response.text()}`);
} }
const jsonResponse = await response.json(); const jsonResponse = await response.json();
...@@ -94,7 +134,7 @@ export class LlamaParseReader implements FileReader { ...@@ -94,7 +134,7 @@ export class LlamaParseReader implements FileReader {
const end = Date.now(); const end = Date.now();
if (end - start > this.maxTimeout * 1000) { if (end - start > this.maxTimeout * 1000) {
throw new Error( throw new Error(
`Timeout while parsing the PDF file: ${await response.text()}`, `Timeout while parsing the file: ${await response.text()}`,
); );
} }
if (this.verbose && tries % 10 === 0) { if (this.verbose && tries % 10 === 0) {
...@@ -116,9 +156,16 @@ export class LlamaParseReader implements FileReader { ...@@ -116,9 +156,16 @@ export class LlamaParseReader implements FileReader {
private async getMimeType(data: Buffer): Promise<string> { private async getMimeType(data: Buffer): Promise<string> {
const mimes = filetypemime(data); const mimes = filetypemime(data);
if (!mimes.includes("application/pdf")) { const validMime = mimes.find((mime) =>
throw new Error("Currently, only PDF files are supported."); Object.values(SupportedFiles).includes(mime),
);
if (!validMime) {
const supportedExtensions = Object.keys(SupportedFiles).join(", ");
throw new Error(
`File has type "${mimes}" which does not match supported MIME Types. Supported formats include: ${supportedExtensions}`,
);
} }
return "application/pdf";
return validMime;
} }
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment