From 89bba6821942c8cc401cc5fac203b99f3c2ec14f Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Fri, 14 Feb 2025 12:07:33 -0800
Subject: [PATCH] Add OCR of image support (#3219)

* OCR PDFs as fallback in spawn thread

* wip

* build our own worker fanout and wrapper

* norm pkgs

* Add image OCR support
---
 .github/workflows/dev-build.yaml              |  2 +-
 .../processSingleFile/convert/asImage.js      | 48 +++++++++++++++
 collector/utils/OCRLoader/index.js            | 61 +++++++++++++++++++
 collector/utils/constants.js                  |  7 +++
 collector/utils/files/mime.js                 |  2 +-
 5 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 collector/processSingleFile/convert/asImage.js

diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
index 62e94e807..433643ae4 100644
--- a/.github/workflows/dev-build.yaml
+++ b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only.
+    branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'
diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js
new file mode 100644
index 000000000..57b6b7ed0
--- /dev/null
+++ b/collector/processSingleFile/convert/asImage.js
@@ -0,0 +1,58 @@
+const { v4 } = require("uuid");
+const { tokenizeString } = require("../../utils/tokenizer");
+const {
+  createdDate,
+  trashFile,
+  writeToServerDocuments,
+} = require("../../utils/files");
+const OCRLoader = require("../../utils/OCRLoader");
+const { default: slugify } = require("slugify");
+
+/**
+ * Converts an uploaded image into an embeddable document by OCRing its text.
+ * The source file is handed to `trashFile` whether or not any text was found.
+ * @param {Object} params
+ * @param {string} params.fullFilePath - Path of the uploaded image on disk.
+ * @param {string} params.filename - File name used for the title, logs and output name.
+ * @returns {Promise<{success: boolean, reason: string|null, documents: Object[]}>}
+ */
+async function asImage({ fullFilePath = "", filename = "" }) {
+  const content = await new OCRLoader().ocrImage(fullFilePath);
+
+  // ocrImage yields null when OCR fails, and an image without legible text may
+  // produce only whitespace - neither is worth writing out as a document.
+  if (!content?.trim().length) {
+    console.error(`Resulting text content was empty for ${filename}.`);
+    trashFile(fullFilePath);
+    return {
+      success: false,
+      reason: `No text content found in ${filename}.`,
+      documents: [],
+    };
+  }
+
+  console.log(`-- Working ${filename} --`);
+  const data = {
+    id: v4(),
+    url: "file://" + fullFilePath,
+    title: filename,
+    docAuthor: "Unknown", // TODO: Find a better author
+    description: "Unknown", // TODO: Find a better description
+    docSource: "an image file uploaded by the user.",
+    chunkSource: "",
+    published: createdDate(fullFilePath),
+    wordCount: content.split(" ").length,
+    pageContent: content,
+    token_count_estimate: tokenizeString(content),
+  };
+
+  const document = writeToServerDocuments(
+    data,
+    `${slugify(filename)}-${data.id}`
+  );
+  trashFile(fullFilePath);
+  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
+  return { success: true, reason: null, documents: [document] };
+}
+
+module.exports = asImage;
diff --git a/collector/utils/OCRLoader/index.js b/collector/utils/OCRLoader/index.js
index 725033b61..88ac31e61 100644
--- a/collector/utils/OCRLoader/index.js
+++ b/collector/utils/OCRLoader/index.js
@@ -185,6 +185,73 @@ class OCRLoader {
     });
     return documents;
   }
+
+  /**
+   * Loads an image file and returns the OCRed text.
+   * A single "eng" tesseract.js worker is created for the call and terminated
+   * afterwards; the call gives up if OCR runs longer than `maxExecutionTime`.
+   * @param {string} filePath - The path to the image file.
+   * @param {Object} options - The options for the OCR.
+   * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds.
+   * @returns {Promise<string|null>} The OCRed text, or null if the file is missing or OCR fails or times out.
+   */
+  async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) {
+    let worker = null;
+    let timeoutId = null;
+    if (
+      !filePath ||
+      !fs.existsSync(filePath) ||
+      !fs.statSync(filePath).isFile()
+    ) {
+      this.log(`File ${filePath} does not exist. Skipping OCR.`);
+      return null;
+    }
+
+    const documentTitle = path.basename(filePath);
+    try {
+      this.log(`Starting OCR of ${documentTitle}`);
+      const startTime = Date.now();
+      const { createWorker, OEM } = require("tesseract.js");
+      worker = await createWorker("eng", OEM.LSTM_ONLY, {
+        cachePath: this.cacheDir,
+      });
+
+      // Race the timeout with the OCR
+      const timeoutPromise = new Promise((_, reject) => {
+        timeoutId = setTimeout(() => {
+          reject(
+            new Error(
+              `OCR job took too long to complete (${
+                maxExecutionTime / 1000
+              } seconds)`
+            )
+          );
+        }, maxExecutionTime);
+      });
+
+      const processImage = async () => {
+        const { data } = await worker.recognize(filePath, {}, "text");
+        return data.text;
+      };
+
+      const content = await Promise.race([timeoutPromise, processImage()]);
+      this.log(`Completed OCR of ${documentTitle}!`, {
+        executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`,
+      });
+
+      return content;
+    } catch (e) {
+      this.log(`Error: ${e.message}`);
+      return null;
+    } finally {
+      // Cancel the pending timeout so it does not linger (and keep the event
+      // loop busy) for up to `maxExecutionTime` after the OCR has settled.
+      clearTimeout(timeoutId);
+      // No early `return` here: returning from `finally` would replace the
+      // value returned by the try/catch above.
+      if (worker) await worker.terminate();
+    }
+  }
 }
 
 module.exports = OCRLoader;
diff --git a/collector/utils/constants.js b/collector/utils/constants.js
index c7beeb4b2..236fc2fc9 100644
--- a/collector/utils/constants.js
+++ b/collector/utils/constants.js
@@ -27,6 +27,9 @@ const ACCEPTED_MIMES = {
   "video/mp4": [".mp4"],
   "video/mpeg": [".mpeg"],
   "application/epub+zip": [".epub"],
+  "image/png": [".png"],
+  "image/jpeg": [".jpg"],
+  "image/jpg": [".jpg"],
 };
 
 const SUPPORTED_FILETYPE_CONVERTERS = {
@@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
   ".wav": "./convert/asAudio.js",
   ".mp4": "./convert/asAudio.js",
   ".mpeg": "./convert/asAudio.js",
+
+  ".png": "./convert/asImage.js",
+  ".jpg": "./convert/asImage.js",
+  ".jpeg": "./convert/asImage.js",
 };
 
 module.exports = {
diff --git a/collector/utils/files/mime.js b/collector/utils/files/mime.js
index 9bf22c222..bd9549653 100644
--- a/collector/utils/files/mime.js
+++ b/collector/utils/files/mime.js
@@ -1,6 +1,6 @@
 const MimeLib = require("mime");
 class MimeDetector {
-  nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"];
+  nonTextTypes = ["multipart", "model", "audio", "video", "font"];
   badMimes = [
     "application/octet-stream",
     "application/zip",
-- 
GitLab