From 4545ce24cdc1f53073b7350981f7f433d14b25ef Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Fri, 14 Feb 2025 17:38:13 -0800
Subject: [PATCH] Drop Node `canvas` for manual `sharp` conversion (#3221)

* Drop Node `canvas` for manual `sharp` conversion

* bump dev
---
 .github/workflows/dev-build.yaml           |   2 +-
 collector/package.json                     |   3 +-
 collector/utils/OCRLoader/CanvasFactory.js |  52 ---------
 collector/utils/OCRLoader/index.js         | 126 +++++++++++++++------
 collector/yarn.lock                        |  37 +-----
 5 files changed, 94 insertions(+), 126 deletions(-)
 delete mode 100644 collector/utils/OCRLoader/CanvasFactory.js

diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
index 433643ae4..787305337 100644
--- a/.github/workflows/dev-build.yaml
+++ b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
+    branches: ['sharp-pdf-image-converter'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'
diff --git a/collector/package.json b/collector/package.json
index b67951df3..7de9338ab 100644
--- a/collector/package.json
+++ b/collector/package.json
@@ -19,7 +19,6 @@
     "@xenova/transformers": "^2.11.0",
     "bcrypt": "^5.1.0",
     "body-parser": "^1.20.2",
-    "canvas": "^2.11.2",
     "cors": "^2.8.5",
     "dotenv": "^16.0.3",
     "epub2": "^3.0.2",
@@ -52,4 +51,4 @@
     "nodemon": "^2.0.22",
     "prettier": "^2.4.1"
   }
-}
\ No newline at end of file
+}
diff --git a/collector/utils/OCRLoader/CanvasFactory.js b/collector/utils/OCRLoader/CanvasFactory.js
deleted file mode 100644
index 067917e51..000000000
--- a/collector/utils/OCRLoader/CanvasFactory.js
+++ /dev/null
@@ -1,52 +0,0 @@
-/**
- * This is a factory for creating a canvas and context in Node.js
- * it is used to create a canvas and context for the PDFLoader for turning the PDF into an image
- * so we can later use the image to extract text from the PDF.
- */
-class NodeCanvasFactory {
-  constructor() {
-    this.CanvasModule = null;
-  }
-
-  async init() {
-    this.CanvasModule = await import("canvas");
-    this.Image = this.CanvasModule.Image;
-  }
-
-  /**
-   * Creates a canvas and context for the PDFLoader
-   * @param {number} width - The width of the canvas
-   * @param {number} height - The height of the canvas
-   * @param {boolean} transparent - Whether the canvas is transparent
-   * @returns {{canvas: HTMLCanvasElement, context: CanvasRenderingContext2D}} - The canvas and context
-   */
-  create(width, height, transparent = false) {
-    const canvas = this.CanvasModule.createCanvas(width, height);
-    const context = canvas.getContext("2d", { alpha: transparent });
-    if (transparent) context.clearRect(0, 0, width, height);
-    return {
-      canvas,
-      context,
-    };
-  }
-
-  /**
-   * Required for the PDFLoader pdfjs interation - do not remove or use directly.
-   */
-  reset(canvasAndContext, width, height) {
-    canvasAndContext.canvas.width = width;
-    canvasAndContext.canvas.height = height;
-  }
-
-  /**
-   * Required for the PDFLoader pdfjs interation - do not remove or use directly.
-   */
-  destroy(canvasAndContext) {
-    canvasAndContext.canvas.width = 0;
-    canvasAndContext.canvas.height = 0;
-    canvasAndContext.canvas = null;
-    canvasAndContext.context = null;
-  }
-}
-
-module.exports = NodeCanvasFactory;
diff --git a/collector/utils/OCRLoader/index.js b/collector/utils/OCRLoader/index.js
index 88ac31e61..45f76506d 100644
--- a/collector/utils/OCRLoader/index.js
+++ b/collector/utils/OCRLoader/index.js
@@ -1,7 +1,6 @@
 const fs = require("fs");
 const os = require("os");
 const path = require("path");
-const NodeCanvasFactory = require("./CanvasFactory");
 
 class OCRLoader {
   constructor() {
@@ -38,15 +37,8 @@ class OCRLoader {
     this.log(`Starting OCR of ${documentTitle}`);
     const pdfjs = await import("pdf-parse/lib/pdf.js/v2.0.550/build/pdf.js");
     let buffer = fs.readFileSync(filePath);
-    const canvasFactory = new NodeCanvasFactory();
-    await canvasFactory.init();
-    global.Image = canvasFactory.Image;
 
-    const pdfDocument = await pdfjs.getDocument({
-      data: new Uint8Array(buffer),
-      canvasFactory,
-    }).promise;
-    buffer = null;
+    const pdfDocument = await pdfjs.getDocument({ data: buffer });
 
     const documents = [];
     const meta = await pdfDocument.getMetadata().catch(() => null);
@@ -60,30 +52,14 @@ class OCRLoader {
       },
     };
 
-    async function getPageAsBuffer(pageNumber, scale = 1) {
-      let canvas = null;
-      let context = null;
-      try {
-        const page = await pdfDocument.getPage(pageNumber);
-        const viewport = page.getViewport(scale);
-        ({ canvas, context } = canvasFactory.create(
-          viewport.width,
-          viewport.height
-        ));
-        await page.render({
-          canvasFactory,
-          canvasContext: context,
-          viewport,
-        }).promise;
-        return canvas.toBuffer();
-      } catch (e) {
-        this.log(`Error getting page as buffer: ${e.message}`);
-        return null;
-      } finally {
-        canvas = null;
-        context = null;
-      }
-    }
+    const pdfSharp = new PDFSharp({
+      validOps: [
+        pdfjs.OPS.paintJpegXObject,
+        pdfjs.OPS.paintImageXObject,
+        pdfjs.OPS.paintInlineImageXObject,
+      ],
+    });
+    await pdfSharp.init();
 
     const { createWorker, OEM } = require("tesseract.js");
     const BATCH_SIZE = batchSize;
@@ -143,7 +119,9 @@ class OCRLoader {
                   workerIndex + 1
                 }]\x1b[0m assigned pg${pageNum}`
               );
-              const imageBuffer = await getPageAsBuffer(pageNum, 5);
+              const page = await pdfDocument.getPage(pageNum);
+              const imageBuffer = await pdfSharp.pageToBuffer({ page });
+              if (!imageBuffer) continue;
               const { data } = await worker.recognize(imageBuffer, {}, "text");
               this.log(
                 `✅ \x1b[34m[Worker ${
@@ -172,7 +150,7 @@ class OCRLoader {
 
       await Promise.race([timeoutPromise, processPages()]);
     } catch (e) {
-      this.log(`Error: ${e.message}`);
+      this.log(`Error: ${e.message}`, e.stack);
     } finally {
       global.Image = undefined;
       await Promise.all(workerPool.map((worker) => worker.terminate()));
@@ -248,4 +226,82 @@ class OCRLoader {
   }
 }
 
+/**
+ * Extracts embedded images from PDF.js pages and re-encodes them as PNG buffers using Sharp.
+ * Only operators whose code is listed in `validOps` are treated as images; pages are not rendered.
+ * @param {Object} [options] - The constructor options.
+ * @param {Array} [options.validOps] - The PDF.js operator codes (`pdfjs.OPS.*`) to treat as images.
+ */
+class PDFSharp {
+  constructor({ validOps = [] } = {}) {
+    this.sharp = null;
+    this.validOps = validOps;
+  }
+
+  log(text, ...args) {
+    console.log(`\x1b[36m[PDFSharp]\x1b[0m ${text}`, ...args);
+  }
+
+  async init() {
+    this.sharp = (await import("sharp")).default;
+  }
+
+  /**
+   * Converts the first valid image found on a PDF page to a resized PNG buffer.
+   * @param {Object} options - The options for the Sharp PDF page object.
+   * @param {Object} options.page - The PDFJS page proxy object.
+   * @returns {Promise<Buffer|null>} The PNG buffer, or null when no valid image is found or an error occurs.
+   */
+  async pageToBuffer({ page }) {
+    if (!this.sharp) await this.init();
+    try {
+      this.log(`Converting page ${page.pageNumber} to image...`);
+      const ops = await page.getOperatorList();
+      const pageImages = ops.fnArray.length;
+
+      for (let i = 0; i < pageImages; i++) {
+        try {
+          if (!this.validOps.includes(ops.fnArray[i])) continue;
+
+          const name = ops.argsArray[i][0];
+          const img = await page.objs.get(name);
+          const { width, height } = img;
+          const size = img.data.length;
+          const channels = size / width / height;
+          const targetDPI = 70;
+          const targetWidth = Math.floor(width * (targetDPI / 72));
+          const targetHeight = Math.floor(height * (targetDPI / 72));
+
+          const image = this.sharp(img.data, {
+            raw: { width, height, channels },
+            density: targetDPI,
+          })
+            .resize({
+              width: targetWidth,
+              height: targetHeight,
+              fit: "fill",
+            })
+            .withMetadata({
+              density: targetDPI,
+              resolution: targetDPI,
+            })
+            .png();
+
+          // For debugging purposes
+          // await image.toFile(path.resolve(__dirname, `../../storage/`, `pg${page.pageNumber}.png`));
+          return await image.toBuffer();
+        } catch (error) {
+          this.log(`Iteration error: ${error.message}`, error.stack);
+          continue;
+        }
+      }
+      this.log(`No valid images found on page ${page.pageNumber}`);
+      return null;
+    } catch (error) {
+      this.log(`Error: ${error.message}`, error.stack);
+      return null;
+    }
+  }
+}
+
 module.exports = OCRLoader;
diff --git a/collector/yarn.lock b/collector/yarn.lock
index 611b39567..df7cf3126 100644
--- a/collector/yarn.lock
+++ b/collector/yarn.lock
@@ -280,7 +280,7 @@
     "@langchain/core" "~0.1"
     js-tiktoken "^1.0.11"
 
-"@mapbox/node-pre-gyp@^1.0.0", "@mapbox/node-pre-gyp@^1.0.11":
+"@mapbox/node-pre-gyp@^1.0.11":
   version "1.0.11"
   resolved "https://registry.yarnpkg.com/@mapbox/node-pre-gyp/-/node-pre-gyp-1.0.11.tgz#417db42b7f5323d79e93b34a6d7a2a12c0df43fa"
   integrity sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==
@@ -793,15 +793,6 @@ camelcase@6:
   resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
   integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
 
-canvas@^2.11.2:
-  version "2.11.2"
-  resolved "https://registry.yarnpkg.com/canvas/-/canvas-2.11.2.tgz#553d87b1e0228c7ac0fc72887c3adbac4abbd860"
-  integrity sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==
-  dependencies:
-    "@mapbox/node-pre-gyp" "^1.0.0"
-    nan "^2.17.0"
-    simple-get "^3.0.3"
-
 chalk@^2.4.2:
   version "2.4.2"
   resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
@@ -1057,13 +1048,6 @@ decamelize@1.2.0:
   resolved "https://registry.yarnpkg.com/decamelize/-/decamelize-1.2.0.tgz#f6534d15148269b20352e7bee26f501f9a191290"
   integrity sha512-z2S+W9X73hAUUki+N+9Za2lBlun89zigOyGrsax+KUQ6wKW4ZoWpEYBkGhQjwAjjDCkWxhY0VKEhk8wzY7F5cA==
 
-decompress-response@^4.2.0:
-  version "4.2.1"
-  resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-4.2.1.tgz#414023cc7a302da25ce2ec82d0d5238ccafd8986"
-  integrity sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==
-  dependencies:
-    mimic-response "^2.0.0"
-
 decompress-response@^6.0.0:
   version "6.0.0"
   resolved "https://registry.yarnpkg.com/decompress-response/-/decompress-response-6.0.0.tgz#ca387612ddb7e104bd16d85aab00d5ecf09c66fc"
@@ -2307,11 +2291,6 @@ mime@^3.0.0:
   resolved "https://registry.yarnpkg.com/mime/-/mime-3.0.0.tgz#b374550dca3a0c18443b0c950a6a58f1931cf7a7"
   integrity sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A==
 
-mimic-response@^2.0.0:
-  version "2.1.0"
-  resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-2.1.0.tgz#d13763d35f613d09ec37ebb30bac0469c0ee8f43"
-  integrity sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==
-
 mimic-response@^3.1.0:
   version "3.1.0"
   resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
@@ -2425,11 +2404,6 @@ mustache@^4.2.0:
   resolved "https://registry.yarnpkg.com/mustache/-/mustache-4.2.0.tgz#e5892324d60a12ec9c2a73359edca52972bf6f64"
   integrity sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==
 
-nan@^2.17.0:
-  version "2.22.0"
-  resolved "https://registry.yarnpkg.com/nan/-/nan-2.22.0.tgz#31bc433fc33213c97bad36404bb68063de604de3"
-  integrity sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==
-
 napi-build-utils@^1.0.1:
   version "1.0.2"
   resolved "https://registry.yarnpkg.com/napi-build-utils/-/napi-build-utils-1.0.2.tgz#b1fddc0b2c46e380a0b7a76f984dd47c41a13806"
@@ -3255,15 +3229,6 @@ simple-concat@^1.0.0:
   resolved "https://registry.yarnpkg.com/simple-concat/-/simple-concat-1.0.1.tgz#f46976082ba35c2263f1c8ab5edfe26c41c9552f"
   integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==
 
-simple-get@^3.0.3:
-  version "3.1.1"
-  resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-3.1.1.tgz#cc7ba77cfbe761036fbfce3d021af25fc5584d55"
-  integrity sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==
-  dependencies:
-    decompress-response "^4.2.0"
-    once "^1.3.1"
-    simple-concat "^1.0.0"
-
 simple-get@^4.0.0, simple-get@^4.0.1:
   version "4.0.1"
   resolved "https://registry.yarnpkg.com/simple-get/-/simple-get-4.0.1.tgz#4a39db549287c979d352112fa03fd99fd6bc3543"
-- 
GitLab