From 89bba6821942c8cc401cc5fac203b99f3c2ec14f Mon Sep 17 00:00:00 2001 From: Timothy Carambat <rambat1010@gmail.com> Date: Fri, 14 Feb 2025 12:07:33 -0800 Subject: [PATCH] Add OCR of image support (#3219) * OCR PDFs as fallback in spawn thread * wip * build our own worker fanout and wrapper * norm pkgs * Add image OCR support --- .github/workflows/dev-build.yaml | 2 +- .../processSingleFile/convert/asImage.js | 48 +++++++++++++++ collector/utils/OCRLoader/index.js | 61 +++++++++++++++++++ collector/utils/constants.js | 7 +++ collector/utils/files/mime.js | 2 +- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 collector/processSingleFile/convert/asImage.js diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml index 62e94e807..433643ae4 100644 --- a/.github/workflows/dev-build.yaml +++ b/.github/workflows/dev-build.yaml @@ -6,7 +6,7 @@ concurrency: on: push: - branches: ['ocr-parse-pdfs'] # put your current branch to create a build. Core team only. + branches: ['ocr-parse-images'] # put your current branch to create a build. Core team only. paths-ignore: - '**.md' - 'cloud-deployments/*' diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js new file mode 100644 index 000000000..57b6b7ed0 --- /dev/null +++ b/collector/processSingleFile/convert/asImage.js @@ -0,0 +1,48 @@ +const { v4 } = require("uuid"); +const { tokenizeString } = require("../../utils/tokenizer"); +const { + createdDate, + trashFile, + writeToServerDocuments, +} = require("../../utils/files"); +const OCRLoader = require("../../utils/OCRLoader"); +const { default: slugify } = require("slugify"); + +async function asImage({ fullFilePath = "", filename = "" }) { + let content = await new OCRLoader().ocrImage(fullFilePath); + + if (!content?.length) { + console.error(`Resulting text content was empty for ${filename}.`); + trashFile(fullFilePath); + return { + success: false, + reason: `No text content found in ${filename}.`, + documents: [], + }; + } + + console.log(`-- Working ${filename} --`); + const data = { + id: v4(), + url: "file://" + fullFilePath, + title: filename, + docAuthor: "Unknown", // TODO: Find a better author + description: "Unknown", // TODO: Find a better description + docSource: "a text file uploaded by the user.", + chunkSource: "", + published: createdDate(fullFilePath), + wordCount: content.split(" ").length, + pageContent: content, + token_count_estimate: tokenizeString(content), + }; + + const document = writeToServerDocuments( + data, + `${slugify(filename)}-${data.id}` + ); + trashFile(fullFilePath); + console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`); + return { success: true, reason: null, documents: [document] }; +} + +module.exports = asImage; diff --git a/collector/utils/OCRLoader/index.js b/collector/utils/OCRLoader/index.js index 725033b61..88ac31e61 100644 --- a/collector/utils/OCRLoader/index.js +++ b/collector/utils/OCRLoader/index.js @@ -185,6 +185,67 @@ class OCRLoader { }); return documents; } + + /** + * Loads an image file and returns the OCRed text. + * @param {string} filePath - The path to the image file. + * @param {Object} options - The options for the OCR. + * @param {number} options.maxExecutionTime - The maximum execution time of the OCR in milliseconds. + * @returns {Promise<string>} The OCRed text. + */ + async ocrImage(filePath, { maxExecutionTime = 300_000 } = {}) { + let content = ""; + let worker = null; + if ( + !filePath || + !fs.existsSync(filePath) || + !fs.statSync(filePath).isFile() + ) { + this.log(`File ${filePath} does not exist. Skipping OCR.`); + return null; + } + + const documentTitle = path.basename(filePath); + try { + this.log(`Starting OCR of ${documentTitle}`); + const startTime = Date.now(); + const { createWorker, OEM } = require("tesseract.js"); + worker = await createWorker("eng", OEM.LSTM_ONLY, { + cachePath: this.cacheDir, + }); + + // Race the timeout with the OCR + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => { + reject( + new Error( + `OCR job took too long to complete (${ + maxExecutionTime / 1000 + } seconds)` + ) + ); + }, maxExecutionTime); + }); + + const processImage = async () => { + const { data } = await worker.recognize(filePath, {}, "text"); + content = data.text; + }; + + await Promise.race([timeoutPromise, processImage()]); + this.log(`Completed OCR of ${documentTitle}!`, { + executionTime: `${((Date.now() - startTime) / 1000).toFixed(2)}s`, + }); + + return content; + } catch (e) { + this.log(`Error: ${e.message}`); + return null; + } finally { + if (!worker) return; + await worker.terminate(); + } + } } module.exports = OCRLoader; diff --git a/collector/utils/constants.js b/collector/utils/constants.js index c7beeb4b2..236fc2fc9 100644 --- a/collector/utils/constants.js +++ b/collector/utils/constants.js @@ -27,6 +27,9 @@ const ACCEPTED_MIMES = { "video/mp4": [".mp4"], "video/mpeg": [".mpeg"], "application/epub+zip": [".epub"], + "image/png": [".png"], + "image/jpeg": [".jpg"], + "image/jpg": [".jpg"], }; const SUPPORTED_FILETYPE_CONVERTERS = { @@ -55,6 +58,10 @@ const SUPPORTED_FILETYPE_CONVERTERS = { ".wav": "./convert/asAudio.js", ".mp4": "./convert/asAudio.js", ".mpeg": "./convert/asAudio.js", + + ".png": "./convert/asImage.js", + ".jpg": "./convert/asImage.js", + ".jpeg": "./convert/asImage.js", }; module.exports = { diff --git a/collector/utils/files/mime.js b/collector/utils/files/mime.js index 9bf22c222..bd9549653 100644 --- a/collector/utils/files/mime.js +++ b/collector/utils/files/mime.js @@ -1,6 +1,6 @@ const MimeLib = require("mime"); class MimeDetector { - nonTextTypes = ["multipart", "image", "model", "audio", "video", "font"]; + nonTextTypes = ["multipart", "model", "audio", "video", "font"]; badMimes = [ "application/octet-stream", "application/zip", -- GitLab