diff --git a/collector/processSingleFile/convert/asImage.js b/collector/processSingleFile/convert/asImage.js index 57b6b7ed0f0bb6551865706f75fd442450ba5aca..d2d197b227a823d34dc747ae4295fd241acf3308 100644 --- a/collector/processSingleFile/convert/asImage.js +++ b/collector/processSingleFile/convert/asImage.js @@ -8,8 +8,10 @@ const { const OCRLoader = require("../../utils/OCRLoader"); const { default: slugify } = require("slugify"); -async function asImage({ fullFilePath = "", filename = "" }) { - let content = await new OCRLoader().ocrImage(fullFilePath); +async function asImage({ fullFilePath = "", filename = "", options = {} }) { + let content = await new OCRLoader({ + targetLanguages: options?.ocr?.langList, + }).ocrImage(fullFilePath); if (!content?.length) { console.error(`Resulting text content was empty for ${filename}.`); diff --git a/collector/processSingleFile/convert/asPDF/index.js b/collector/processSingleFile/convert/asPDF/index.js index 350f217f4fa2322f3c66b7872b39e83213a9e613..b929fbda16f76965048acefe83ee6d7666ecd50e 100644 --- a/collector/processSingleFile/convert/asPDF/index.js +++ b/collector/processSingleFile/convert/asPDF/index.js @@ -9,7 +9,7 @@ const { default: slugify } = require("slugify"); const PDFLoader = require("./PDFLoader"); const OCRLoader = require("../../../utils/OCRLoader"); -async function asPdf({ fullFilePath = "", filename = "" }) { +async function asPdf({ fullFilePath = "", filename = "", options = {} }) { const pdfLoader = new PDFLoader(fullFilePath, { splitPages: true, }); @@ -22,7 +22,9 @@ async function asPdf({ fullFilePath = "", filename = "" }) { console.log( `[asPDF] No text content found for ${filename}. 
Will attempt OCR parse.` ); - docs = await new OCRLoader().ocrPDF(fullFilePath); + docs = await new OCRLoader({ + targetLanguages: options?.ocr?.langList, + }).ocrPDF(fullFilePath); } for (const doc of docs) { diff --git a/collector/utils/OCRLoader/index.js b/collector/utils/OCRLoader/index.js index 45f76506d356c603d9345dd11f2efe4772db55df..1c952d6f887c2b8ed8f5cc56521e4d731cedeb90 100644 --- a/collector/utils/OCRLoader/index.js +++ b/collector/utils/OCRLoader/index.js @@ -1,14 +1,61 @@ const fs = require("fs"); const os = require("os"); const path = require("path"); +const { VALID_LANGUAGE_CODES } = require("./validLangs"); class OCRLoader { - constructor() { + /** + * The language code(s) to use for the OCR. + * @type {string[]} + */ + language; + /** + * The cache directory for the OCR. + * @type {string} + */ + cacheDir; + + /** + * The constructor for the OCRLoader. + * @param {Object} options - The options for the OCRLoader. + * @param {string} options.targetLanguages - The target languages to use for the OCR as a comma separated string. eg: "eng,deu,..." + */ + constructor({ targetLanguages = "eng" } = {}) { + this.language = this.parseLanguages(targetLanguages); this.cacheDir = path.resolve( process.env.STORAGE_DIR ? path.resolve(process.env.STORAGE_DIR, `models`, `tesseract`) : path.resolve(__dirname, `../../../server/storage/models/tesseract`) ); + + // Ensure the cache directory exists or else Tesseract will persist the cache in the default location. + if (!fs.existsSync(this.cacheDir)) + fs.mkdirSync(this.cacheDir, { recursive: true }); + this.log( + `OCRLoader initialized with language support for:`, + this.language.map((lang) => VALID_LANGUAGE_CODES[lang]).join(", ") + ); + } + + /** + * Parses the language code from a provided comma separated string of language codes. + * @param {string} language - The language code to parse. + * @returns {string[]} The parsed language code. 
+ */ + parseLanguages(language = null) { + try { + if (!language || typeof language !== "string") return ["eng"]; + const langList = language + .split(",") + .map((lang) => (lang.trim() !== "" ? lang.trim() : null)) + .filter(Boolean) + .filter((lang) => VALID_LANGUAGE_CODES.hasOwnProperty(lang)); + if (langList.length === 0) return ["eng"]; + return [...new Set(langList)]; // drop repeated codes (first occurrence wins) so each language is only requested once + } catch (e) { + this.log(`Error parsing languages: ${e.message}`, e.stack); + return ["eng"]; + } } log(text, ...args) { @@ -70,7 +117,7 @@ class OCRLoader { Array(NUM_WORKERS) .fill(0) .map(() => - createWorker("eng", OEM.LSTM_ONLY, { + createWorker(this.language, OEM.LSTM_ONLY, { cachePath: this.cacheDir, }) ) @@ -188,7 +235,7 @@ class OCRLoader { this.log(`Starting OCR of ${documentTitle}`); const startTime = Date.now(); const { createWorker, OEM } = require("tesseract.js"); - worker = await createWorker("eng", OEM.LSTM_ONLY, { + worker = await createWorker(this.language, OEM.LSTM_ONLY, { cachePath: this.cacheDir, }); diff --git a/collector/utils/OCRLoader/validLangs.js b/collector/utils/OCRLoader/validLangs.js new file mode 100644 index 0000000000000000000000000000000000000000..5bc807ade68561fdfa6c7b49fc17f12bc2f21431 --- /dev/null +++ b/collector/utils/OCRLoader/validLangs.js @@ -0,0 +1,155 @@ +/* + +To get the list of valid language codes - do the following: +Open the following URL in your browser: https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html + +Check this element is the proper table tbody with all the codes via console: +document.getElementsByTagName('table').item(0).children.item(1) + +Now, copy the following code and paste it into the console: +function parseLangs() { +let langs = {}; + Array.from(document.getElementsByTagName('table').item(0).children.item(1).children).forEach((el) => { + const [codeEl, languageEl, ...rest] = el.children + const code = codeEl.innerText.trim() + const language = languageEl.innerText.trim() + if (!!code && !!language) langs[code] = 
language + }) + return langs; +} + +now, run the function: +copy(parseLangs()) +*/ + +const VALID_LANGUAGE_CODES = { + afr: "Afrikaans", + amh: "Amharic", + ara: "Arabic", + asm: "Assamese", + aze: "Azerbaijani", + aze_cyrl: "Azerbaijani - Cyrilic", + bel: "Belarusian", + ben: "Bengali", + bod: "Tibetan", + bos: "Bosnian", + bre: "Breton", + bul: "Bulgarian", + cat: "Catalan; Valencian", + ceb: "Cebuano", + ces: "Czech", + chi_sim: "Chinese - Simplified", + chi_tra: "Chinese - Traditional", + chr: "Cherokee", + cos: "Corsican", + cym: "Welsh", + dan: "Danish", + dan_frak: "Danish - Fraktur (contrib)", + deu: "German", + deu_frak: "German - Fraktur (contrib)", + deu_latf: "German (Fraktur Latin)", + dzo: "Dzongkha", + ell: "Greek, Modern (1453-)", + eng: "English", + enm: "English, Middle (1100-1500)", + epo: "Esperanto", + equ: "Math / equation detection module", + est: "Estonian", + eus: "Basque", + fao: "Faroese", + fas: "Persian", + fil: "Filipino (old - Tagalog)", + fin: "Finnish", + fra: "French", + frk: "German - Fraktur (now deu_latf)", + frm: "French, Middle (ca.1400-1600)", + fry: "Western Frisian", + gla: "Scottish Gaelic", + gle: "Irish", + glg: "Galician", + grc: "Greek, Ancient (to 1453) (contrib)", + guj: "Gujarati", + hat: "Haitian; Haitian Creole", + heb: "Hebrew", + hin: "Hindi", + hrv: "Croatian", + hun: "Hungarian", + hye: "Armenian", + iku: "Inuktitut", + ind: "Indonesian", + isl: "Icelandic", + ita: "Italian", + ita_old: "Italian - Old", + jav: "Javanese", + jpn: "Japanese", + kan: "Kannada", + kat: "Georgian", + kat_old: "Georgian - Old", + kaz: "Kazakh", + khm: "Central Khmer", + kir: "Kirghiz; Kyrgyz", + kmr: "Kurmanji (Kurdish - Latin Script)", + kor: "Korean", + kor_vert: "Korean (vertical)", + kur: "Kurdish (Arabic Script)", + lao: "Lao", + lat: "Latin", + lav: "Latvian", + lit: "Lithuanian", + ltz: "Luxembourgish", + mal: "Malayalam", + mar: "Marathi", + mkd: "Macedonian", + mlt: "Maltese", + mon: "Mongolian", + mri: "Maori", + msa: 
"Malay", + mya: "Burmese", + nep: "Nepali", + nld: "Dutch; Flemish", + nor: "Norwegian", + oci: "Occitan (post 1500)", + ori: "Oriya", + osd: "Orientation and script detection module", + pan: "Panjabi; Punjabi", + pol: "Polish", + por: "Portuguese", + pus: "Pushto; Pashto", + que: "Quechua", + ron: "Romanian; Moldavian; Moldovan", + rus: "Russian", + san: "Sanskrit", + sin: "Sinhala; Sinhalese", + slk: "Slovak", + slk_frak: "Slovak - Fraktur (contrib)", + slv: "Slovenian", + snd: "Sindhi", + spa: "Spanish; Castilian", + spa_old: "Spanish; Castilian - Old", + sqi: "Albanian", + srp: "Serbian", + srp_latn: "Serbian - Latin", + sun: "Sundanese", + swa: "Swahili", + swe: "Swedish", + syr: "Syriac", + tam: "Tamil", + tat: "Tatar", + tel: "Telugu", + tgk: "Tajik", + tgl: "Tagalog (new - Filipino)", + tha: "Thai", + tir: "Tigrinya", + ton: "Tonga", + tur: "Turkish", + uig: "Uighur; Uyghur", + ukr: "Ukrainian", + urd: "Urdu", + uzb: "Uzbek", + uzb_cyrl: "Uzbek - Cyrilic", + vie: "Vietnamese", + yid: "Yiddish", + yor: "Yoruba", +}; + +module.exports.VALID_LANGUAGE_CODES = VALID_LANGUAGE_CODES; diff --git a/docker/.env.example b/docker/.env.example index 40acac84b68e853533cadc01a4ebf401c6ed127b..a5358c6684bf0a4c006de82e632aa8548bc85f5c 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -321,3 +321,8 @@ GID='1000' # Enable simple SSO passthrough to pre-authenticate users from a third party service. # See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information. # SIMPLE_SSO_ENABLED=1 + +# Specify the target languages for when using OCR to parse images and PDFs. +# This is a comma separated list of language codes as a string. Unsupported languages will be ignored. +# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes. 
+# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol \ No newline at end of file diff --git a/server/.env.example b/server/.env.example index f5fa69a34f49bf2e0b7e2e35827532e342f00643..cfd17789ccf83350a6f4ae8a22e304ed4d816e5b 100644 --- a/server/.env.example +++ b/server/.env.example @@ -310,3 +310,8 @@ TTS_PROVIDER="native" # Enable simple SSO passthrough to pre-authenticate users from a third party service. # See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information. # SIMPLE_SSO_ENABLED=1 + +# Specify the target languages for when using OCR to parse images and PDFs. +# This is a comma separated list of language codes as a string. Unsupported languages will be ignored. +# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes. +# TARGET_OCR_LANG=eng,deu,ita,spa,fra,por,rus,nld,tur,hun,pol \ No newline at end of file diff --git a/server/utils/collectorApi/index.js b/server/utils/collectorApi/index.js index 22e2bcd9d6ddd1d67c158077e5d282572af37c56..c6aed9ad747dcc175591e775ea4e503197741b76 100644 --- a/server/utils/collectorApi/index.js +++ b/server/utils/collectorApi/index.js @@ -20,6 +20,9 @@ class CollectorApi { whisperProvider: process.env.WHISPER_PROVIDER || "local", WhisperModelPref: process.env.WHISPER_MODEL_PREF, openAiKey: process.env.OPEN_AI_KEY || null, + ocr: { + langList: process.env.TARGET_OCR_LANG || "eng", + }, }; } diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js index ab76cb15fcfa6adc5c6ce8fc8ad685ad86192060..d50118bef679fe42a54efa8cfadc5ce57a7d90ad 100644 --- a/server/utils/helpers/updateENV.js +++ b/server/utils/helpers/updateENV.js @@ -978,6 +978,9 @@ function dumpENV() { // Nvidia NIM Keys that are automatically managed "NVIDIA_NIM_LLM_MODEL_TOKEN_LIMIT", + + // OCR Language Support + "TARGET_OCR_LANG", ]; // Simple
sanitization of each value to prevent ENV injection via newline or quote escaping.