Skip to content
Snippets Groups Projects
Unverified Commit 9b86bbd2 authored by Sean Hatfield's avatar Sean Hatfield Committed by GitHub
Browse files

[FIX] PDFLoader module bug fix (#1879)

use pdf.js by importing it from pdf-parse and fix custom PDFLoader module
parent 86a66ba5
No related branches found
No related tags found
No related merge requests found
const fs = require("fs").promises; const fs = require("fs").promises;
const pdf = require("pdf-parse");
class PDFLoader { class PDFLoader {
constructor(filePath, { splitPages = true } = {}) { constructor(filePath, { splitPages = true } = {}) {
...@@ -9,54 +8,90 @@ class PDFLoader { ...@@ -9,54 +8,90 @@ class PDFLoader {
async load() { async load() {
const buffer = await fs.readFile(this.filePath); const buffer = await fs.readFile(this.filePath);
const { getDocument, version } = await this.getPdfJS();
const options = { const pdf = await getDocument({
pagerender: this.splitPages ? this.renderPage : null, data: new Uint8Array(buffer),
}; useWorkerFetch: false,
isEvalSupported: false,
const { text, numpages, info, metadata, version } = await pdf( useSystemFonts: true,
buffer, }).promise;
options
); const meta = await pdf.getMetadata().catch(() => null);
const documents = [];
if (!this.splitPages) {
return [ for (let i = 1; i <= pdf.numPages; i += 1) {
{ const page = await pdf.getPage(i);
pageContent: text.trim(), const content = await page.getTextContent();
metadata: {
source: this.filePath, if (content.items.length === 0) {
pdf: { version, info, metadata, totalPages: numpages }, continue;
}
let lastY;
const textItems = [];
for (const item of content.items) {
if ("str" in item) {
if (lastY === item.transform[5] || !lastY) {
textItems.push(item.str);
} else {
textItems.push(`\n${item.str}`);
}
lastY = item.transform[5];
}
}
const text = textItems.join("");
documents.push({
pageContent: text.trim(),
metadata: {
source: this.filePath,
pdf: {
version,
info: meta?.info,
metadata: meta?.metadata,
totalPages: pdf.numPages,
}, },
loc: { pageNumber: i },
}, },
]; });
}
if (this.splitPages) {
return documents;
}
if (documents.length === 0) {
return [];
} }
return this.pages.map((pageContent, index) => ({ return [
pageContent: pageContent.trim(), {
metadata: { pageContent: documents.map((doc) => doc.pageContent).join("\n\n"),
source: this.filePath, metadata: {
pdf: { version, info, metadata, totalPages: numpages }, source: this.filePath,
loc: { pageNumber: index + 1 }, pdf: {
version,
info: meta?.info,
metadata: meta?.metadata,
totalPages: pdf.numPages,
},
},
}, },
})); ];
} }
pages = []; async getPdfJS() {
try {
renderPage = async (pageData) => { const pdfjs = await import("pdf-parse/lib/pdf.js/v1.10.100/build/pdf.js");
const textContent = await pageData.getTextContent(); return { getDocument: pdfjs.getDocument, version: pdfjs.version };
let lastY, } catch (e) {
text = ""; console.error(e);
for (const item of textContent.items) { throw new Error(
if (lastY !== item.transform[5] && lastY !== undefined) { "Failed to load pdf-parse. Please install it with eg. `npm install pdf-parse`."
text += "\n"; );
}
text += item.str;
lastY = item.transform[5];
} }
this.pages.push(text); }
return text;
};
} }
module.exports = PDFLoader; module.exports = PDFLoader;
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment