Skip to content
Snippets Groups Projects
Unverified Commit a8701482 authored by Sean Hatfield's avatar Sean Hatfield Committed by GitHub
Browse files

[REFACTOR] Improve asPDF collector processor with pdfjs (#1791)


* WIP replace langchain pdfloader with pdfjs and add more context to each page

* remove extras from pdfjs and just replace langchain library

* remove unneeded dep

* fix console log in docs

---------

Co-authored-by: default avatartimothycarambat <rambat1010@gmail.com>
parent 3fa00ad7
No related branches found
No related tags found
No related merge requests found
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
"node-html-parser": "^6.1.13", "node-html-parser": "^6.1.13",
"officeparser": "^4.0.5", "officeparser": "^4.0.5",
"openai": "4.38.5", "openai": "4.38.5",
"pdf-parse": "^1.1.1", "pdfjs-dist": "3.4.120",
"puppeteer": "~21.5.2", "puppeteer": "~21.5.2",
"slugify": "^1.6.6", "slugify": "^1.6.6",
"url-pattern": "^1.0.3", "url-pattern": "^1.0.3",
...@@ -49,4 +49,4 @@ ...@@ -49,4 +49,4 @@
"nodemon": "^2.0.22", "nodemon": "^2.0.22",
"prettier": "^2.4.1" "prettier": "^2.4.1"
} }
} }
\ No newline at end of file
...@@ -15,7 +15,6 @@ async function asDocX({ fullFilePath = "", filename = "" }) { ...@@ -15,7 +15,6 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
let pageContent = []; let pageContent = [];
const docs = await loader.load(); const docs = await loader.load();
for (const doc of docs) { for (const doc of docs) {
console.log(doc.metadata);
console.log(`-- Parsing content from docx page --`); console.log(`-- Parsing content from docx page --`);
if (!doc.pageContent.length) continue; if (!doc.pageContent.length) continue;
pageContent.push(doc.pageContent); pageContent.push(doc.pageContent);
......
const { v4 } = require("uuid"); const { v4 } = require("uuid");
const { PDFLoader } = require("langchain/document_loaders/fs/pdf");
const { const {
createdDate, createdDate,
trashFile, trashFile,
...@@ -9,21 +8,24 @@ const { tokenizeString } = require("../../utils/tokenizer"); ...@@ -9,21 +8,24 @@ const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify"); const { default: slugify } = require("slugify");
async function asPDF({ fullFilePath = "", filename = "" }) { async function asPDF({ fullFilePath = "", filename = "" }) {
const pdfLoader = new PDFLoader(fullFilePath, { const pdfjsLib = await import("pdfjs-dist");
splitPages: true,
});
console.log(`-- Working ${filename} --`); console.log(`-- Working ${filename} --`);
const loadingTask = pdfjsLib.default.getDocument(fullFilePath);
const pdf = await loadingTask.promise;
const numPages = pdf.numPages;
const pageContent = []; const pageContent = [];
const docs = await pdfLoader.load();
for (const doc of docs) { for (let i = 1; i <= numPages; i++) {
console.log( console.log(`-- Parsing content from pg ${i} --`);
`-- Parsing content from pg ${ const page = await pdf.getPage(i);
doc.metadata?.loc?.pageNumber || "unknown" const content = await page.getTextContent();
} --` const text = content.items.map((item) => item.str).join(" ");
);
if (!doc.pageContent || !doc.pageContent.length) continue; if (text.length) {
pageContent.push(doc.pageContent); pageContent.push(text);
}
} }
if (!pageContent.length) { if (!pageContent.length) {
...@@ -36,13 +38,15 @@ async function asPDF({ fullFilePath = "", filename = "" }) { ...@@ -36,13 +38,15 @@ async function asPDF({ fullFilePath = "", filename = "" }) {
}; };
} }
const content = pageContent.join(""); const content = pageContent.join(" ");
const metadata = await pdf.getMetadata();
const data = { const data = {
id: v4(), id: v4(),
url: "file://" + fullFilePath, url: "file://" + fullFilePath,
title: filename, title: filename,
docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found", docAuthor: metadata?.info?.Creator || "no author found",
description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.", description: metadata?.info?.Title || "No description found.",
docSource: "pdf file uploaded by the user.", docSource: "pdf file uploaded by the user.",
chunkSource: "", chunkSource: "",
published: createdDate(fullFilePath), published: createdDate(fullFilePath),
......
...@@ -108,7 +108,7 @@ ...@@ -108,7 +108,7 @@
"@langchain/core" "~0.1" "@langchain/core" "~0.1"
js-tiktoken "^1.0.11" js-tiktoken "^1.0.11"
"@mapbox/node-pre-gyp@^1.0.11": "@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11":
version "1.0.11" version "1.0.11"
resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa" resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ== integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
...@@ -643,6 +643,15 @@ camelcase@6: ...@@ -643,6 +643,15 @@ camelcase@6:
resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
canvas@^2.11.0:
version "2.11.2"
resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860"
integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==
dependencies:
"@mapbox/node-pre-gyp" "^1.0.0"
nan "^2.17.0"
simple-get "^3.0.3"
chalk@^2.4.2: chalk@^2.4.2:
version "2.4.2" version "2.4.2"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
...@@ -892,6 +901,13 @@ decamelize@1.2.0: ...@@ -892,6 +901,13 @@ decamelize@1.2.0:
resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290" resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA== integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==
decompress-response@^4.2.0:
version "4.2.1"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
dependencies:
mimic-response "^2.0.0"
decompress-response@^6.0.0: decompress-response@^6.0.0:
version "6.0.0" version "6.0.0"
resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc" resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc"
...@@ -2154,6 +2170,11 @@ mime@^3.0.0: ...@@ -2154,6 +2170,11 @@ mime@^3.0.0:
resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7" resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7"
integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A== integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==
mimic-response@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43"
integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==
mimic-response@^3.1.0: mimic-response@^3.1.0:
version "3.1.0" version "3.1.0"
resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9" resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
...@@ -2287,6 +2308,11 @@ mustache@^4.2.0: ...@@ -2287,6 +2308,11 @@ mustache@^4.2.0:
resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64" resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ== integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==
nan@^2.17.0:
version "2.20.0"
resolved "https://registry.yarnpkg.com/nan/-/nan-2.20.0.tgz#08c5ea813dd54ed16e5bd6505bf42af4f7838ca3"
integrity sha512-bk3gXBZDGILuuo/6sKtr0DQmSThYHLtNCdSdXk9YkxD/jK6X2vmCyyXBBxyqZ4XcnzTyYEAThfX3DCEnLf6igw==
napi-build-utils@^1.0.1: napi-build-utils@^1.0.1:
version "1.0.2" version "1.0.2"
resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806" resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
...@@ -2615,6 +2641,18 @@ path-type@^4.0.0: ...@@ -2615,6 +2641,18 @@ path-type@^4.0.0:
resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b" resolved "https://registry.yarnpkg.com/path-type/-/path-type-4.0.0.tgz#84ed01c0a7ba380afe09d90a8c180dcd9d03043b"
integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw== integrity sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==
path2d-polyfill@^2.0.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/path2d-polyfill/-/path2d-polyfill-2.1.1.tgz#6098b7bf2fc24c306c6377bcd558b17ba437ea27"
integrity sha512-4Rka5lN+rY/p0CdD8+E+BFv51lFaFvJOrlOhyQ+zjzyQrzyh3ozmxd1vVGGDdIbUFSBtIZLSnspxTgPT0iJhvA==
dependencies:
path2d "0.1.1"
path2d@0.1.1:
version "0.1.1"
resolved "https://registry.yarnpkg.com/path2d/-/path2d-0.1.1.tgz#d3c3886cd2252fb2a7830c27ea7bb9a862d937ea"
integrity sha512-/+S03c8AGsDYKKBtRDqieTJv2GlkMb0bWjnqOgtF6MkjdUQ9a8ARAtxWf9NgKLGm2+WQr6+/tqJdU8HNGsIDoA==
pdf-parse@^1.1.1: pdf-parse@^1.1.1:
version "1.1.1" version "1.1.1"
resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7" resolved "https://registry.yarnpkg.com/pdf-parse/-/pdf-parse-1.1.1.tgz#745e07408679548b3995ff896fd38e96e19d14a7"
...@@ -2623,6 +2661,16 @@ pdf-parse@^1.1.1: ...@@ -2623,6 +2661,16 @@ pdf-parse@^1.1.1:
debug "^3.1.0" debug "^3.1.0"
node-ensure "^0.0.0" node-ensure "^0.0.0"
pdfjs-dist@3.4.120:
version "3.4.120"
resolved "https://registry.yarnpkg.com/pdfjs-dist/-/pdfjs-dist-3.4.120.tgz#6f4222117157498f179c95dc4569fad6336a8fdd"
integrity sha512-B1hw9ilLG4m/jNeFA0C2A0PZydjxslP8ylU+I4XM7Bzh/xWETo9EiBV848lh0O0hLut7T6lK1V7cpAXv5BhxWw==
dependencies:
path2d-polyfill "^2.0.1"
web-streams-polyfill "^3.2.1"
optionalDependencies:
canvas "^2.11.0"
peberminta@^0.9.0: peberminta@^0.9.0:
version "0.9.0" version "0.9.0"
resolved "https://registry.yarnpkg.com/peberminta/-/peberminta-0.9.0.tgz#8ec9bc0eb84b7d368126e71ce9033501dca2a352" resolved "https://registry.yarnpkg.com/peberminta/-/peberminta-0.9.0.tgz#8ec9bc0eb84b7d368126e71ce9033501dca2a352"
...@@ -3048,6 +3096,15 @@ simple-concat@^1.0.0: ...@@ -3048,6 +3096,15 @@ simple-concat@^1.0.0:
resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f" resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f"
integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q== integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==
simple-get@^3.0.3:
version "3.1.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55"
integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==
dependencies:
decompress-response "^4.2.0"
once "^1.3.1"
simple-concat "^1.0.0"
simple-get@^4.0.0, simple-get@^4.0.1: simple-get@^4.0.0, simple-get@^4.0.1:
version "4.0.1" version "4.0.1"
resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543" resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment