diff --git a/collector/package.json b/collector/package.json index bf6498c065ae74b9137e96e25cb8596e0aa34e97..1509645a10481893ba21cc90230a2dfdd110eac8 100644 --- a/collector/package.json +++ b/collector/package.json @@ -38,6 +38,7 @@ "openai": "4.38.5", "pdf-parse": "^1.1.1", "puppeteer": "~21.5.2", + "sharp": "^0.33.5", "slugify": "^1.6.6", "url-pattern": "^1.0.3", "uuid": "^9.0.0", diff --git a/collector/utils/WhisperProviders/localWhisper.js b/collector/utils/WhisperProviders/localWhisper.js index af13c8a9255dc7e3a339193a156142fca5f71d71..05ebab2812ac73ce390a52f7fdd887dc66860c88 100644 --- a/collector/utils/WhisperProviders/localWhisper.js +++ b/collector/utils/WhisperProviders/localWhisper.js @@ -29,6 +29,37 @@ class LocalWhisper { console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args); } + #validateAudioFile(wavFile) { + const sampleRate = wavFile.fmt.sampleRate; + const duration = wavFile.data.samples / sampleRate; + + // Most speech recognition systems expect minimum 8kHz + // But we'll set it lower to be safe + if (sampleRate < 4000) { + // 4kHz minimum + throw new Error( + "Audio file sample rate is too low for accurate transcription. Minimum required is 4kHz." + ); + } + + // Typical audio file duration limits + const MAX_DURATION_SECONDS = 4 * 60 * 60; // 4 hours + if (duration > MAX_DURATION_SECONDS) { + throw new Error("Audio file duration exceeds maximum limit of 4 hours."); + } + + // Check final sample count after upsampling to prevent memory issues + const targetSampleRate = 16000; + const upsampledSamples = duration * targetSampleRate; + const MAX_SAMPLES = 230_400_000; // ~4 hours at 16kHz + + if (upsampledSamples > MAX_SAMPLES) { + throw new Error("Audio file exceeds maximum allowed length."); + } + + return true; + } + async #convertToWavAudioData(sourcePath) { try { let buffer; @@ -81,6 +112,13 @@ class LocalWhisper { } const wavFile = new wavefile.WaveFile(buffer); + try { + this.#validateAudioFile(wavFile); + } catch (error) { + this.#log(`Audio validation failed: ${error.message}`); + throw new Error(`Invalid audio file: ${error.message}`); + } + wavFile.toBitDepth("32f"); wavFile.toSampleRate(16000); diff --git a/collector/yarn.lock b/collector/yarn.lock index f991b43faecede250297487090c3d7c6ffa5df74..402fee3ea7e1902b54d4786f07cfb081377852d8 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -54,6 +54,13 @@ enabled "2.0.x" kuler "^2.0.0" +"@emnapi/runtime@^1.2.0": + version "1.3.1" + resolved "https://registry.yarnpkg.com/@emnapi/runtime/-/runtime-1.3.1.tgz#0fcaa575afc31f455fd33534c19381cfce6c6f60" + integrity sha512-kEBmG8KyqtxJZv+ygbEim+KCGtIq1fC22Ms3S4ziXmYKm8uyoLX0MHONVKwp+9opg390VaKRNt4a7A9NwmpNhw== + dependencies: + tslib "^2.4.0" + "@fastify/busboy@^2.0.0": version "2.1.1" resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.1.tgz#b9da6a878a371829a0502c9b6c1c143ef6663f4d" @@ -64,6 +71,119 @@ resolved "https://registry.yarnpkg.com/@huggingface/jinja/-/jinja-0.2.2.tgz#faeb205a9d6995089bef52655ddd8245d3190627" integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA== +"@img/sharp-darwin-arm64@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.33.5.tgz#ef5b5a07862805f1e8145a377c8ba6e98813ca08" + integrity sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ== + optionalDependencies: + "@img/sharp-libvips-darwin-arm64" "1.0.4" + +"@img/sharp-darwin-x64@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.33.5.tgz#e03d3451cd9e664faa72948cc70a403ea4063d61" + integrity sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q== + optionalDependencies: + "@img/sharp-libvips-darwin-x64" "1.0.4" + +"@img/sharp-libvips-darwin-arm64@1.0.4": + version "1.0.4" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.0.4.tgz#447c5026700c01a993c7804eb8af5f6e9868c07f" + integrity sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg== + +"@img/sharp-libvips-darwin-x64@1.0.4": + version "1.0.4" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.0.4.tgz#e0456f8f7c623f9dbfbdc77383caa72281d86062" + integrity sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ== + +"@img/sharp-libvips-linux-arm64@1.0.4": + version "1.0.4" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.0.4.tgz#979b1c66c9a91f7ff2893556ef267f90ebe51704" + integrity sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA== + +"@img/sharp-libvips-linux-arm@1.0.5": + version "1.0.5" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.0.5.tgz#99f922d4e15216ec205dcb6891b721bfd2884197" + integrity sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g== + +"@img/sharp-libvips-linux-s390x@1.0.4": + version "1.0.4" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.0.4.tgz#f8a5eb1f374a082f72b3f45e2fb25b8118a8a5ce" + integrity sha512-u7Wz6ntiSSgGSGcjZ55im6uvTrOxSIS8/dgoVMoiGE9I6JAfU50yH5BoDlYA1tcuGS7g/QNtetJnxA6QEsCVTA== + +"@img/sharp-libvips-linux-x64@1.0.4": + version "1.0.4" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.0.4.tgz#d4c4619cdd157774906e15770ee119931c7ef5e0" + integrity sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw== + +"@img/sharp-libvips-linuxmusl-arm64@1.0.4": + version "1.0.4" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.0.4.tgz#166778da0f48dd2bded1fa3033cee6b588f0d5d5" + integrity sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA== + +"@img/sharp-libvips-linuxmusl-x64@1.0.4": + version "1.0.4" + resolved "https://registry.yarnpkg.com/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.0.4.tgz#93794e4d7720b077fcad3e02982f2f1c246751ff" + integrity sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw== + +"@img/sharp-linux-arm64@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.33.5.tgz#edb0697e7a8279c9fc829a60fc35644c4839bb22" + integrity sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA== + optionalDependencies: + "@img/sharp-libvips-linux-arm64" "1.0.4" + +"@img/sharp-linux-arm@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-linux-arm/-/sharp-linux-arm-0.33.5.tgz#422c1a352e7b5832842577dc51602bcd5b6f5eff" + integrity sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ== + optionalDependencies: + "@img/sharp-libvips-linux-arm" "1.0.5" + +"@img/sharp-linux-s390x@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.33.5.tgz#f5c077926b48e97e4a04d004dfaf175972059667" + integrity sha512-y/5PCd+mP4CA/sPDKl2961b+C9d+vPAveS33s6Z3zfASk2j5upL6fXVPZi7ztePZ5CuH+1kW8JtvxgbuXHRa4Q== + optionalDependencies: + "@img/sharp-libvips-linux-s390x" "1.0.4" + +"@img/sharp-linux-x64@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-linux-x64/-/sharp-linux-x64-0.33.5.tgz#d806e0afd71ae6775cc87f0da8f2d03a7c2209cb" + integrity sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA== + optionalDependencies: + "@img/sharp-libvips-linux-x64" "1.0.4" + +"@img/sharp-linuxmusl-arm64@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.33.5.tgz#252975b915894fb315af5deea174651e208d3d6b" + integrity sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g== + optionalDependencies: + "@img/sharp-libvips-linuxmusl-arm64" "1.0.4" + +"@img/sharp-linuxmusl-x64@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.33.5.tgz#3f4609ac5d8ef8ec7dadee80b560961a60fd4f48" + integrity sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw== + optionalDependencies: + "@img/sharp-libvips-linuxmusl-x64" "1.0.4" + +"@img/sharp-wasm32@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-wasm32/-/sharp-wasm32-0.33.5.tgz#6f44f3283069d935bb5ca5813153572f3e6f61a1" + integrity sha512-ykUW4LVGaMcU9lu9thv85CbRMAwfeadCJHRsg2GmeRa/cJxsVY9Rbd57JcMxBkKHag5U/x7TSBpScF4U8ElVzg== + dependencies: + "@emnapi/runtime" "^1.2.0" + +"@img/sharp-win32-ia32@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.33.5.tgz#1a0c839a40c5351e9885628c85f2e5dfd02b52a9" + integrity sha512-T36PblLaTwuVJ/zw/LaH0PdZkRz5rd3SmMHX8GSmR7vtNSP5Z6bQkExdSK7xGWyxLw4sUknBuugTelgw2faBbQ== + +"@img/sharp-win32-x64@0.33.5": + version "0.33.5" + resolved "https://registry.yarnpkg.com/@img/sharp-win32-x64/-/sharp-win32-x64-0.33.5.tgz#56f00962ff0c4e0eb93d34a047d29fa995e3e342" + integrity sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg== + "@langchain/community@^0.2.23": version "0.2.23" resolved "https://registry.yarnpkg.com/@langchain/community/-/community-0.2.23.tgz#20560e107bcc8432c42e499f1b9292d41a3732f2" @@ -1038,7 +1158,7 @@ destroy@1.2.0: resolved "https://registry.yarnpkg.com/destroy/-/destroy-1.2.0.tgz#4803735509ad8be552934c67df614f94e66fa015" integrity sha512-2sJGJTaXIIaR1w4iJSNoN0hnMY7Gpc/n8D4qSCJw8QqFWXf7cuAgnEHxBpweaVcPevC2l3KpjYCx3NypQQgaJg== -detect-libc@^2.0.0, detect-libc@^2.0.2: +detect-libc@^2.0.0, detect-libc@^2.0.2, detect-libc@^2.0.3: version "2.0.3" resolved "https://registry.yarnpkg.com/detect-libc/-/detect-libc-2.0.3.tgz#f0cd503b40f9939b894697d19ad50895e30cf700" integrity sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw== @@ -3035,6 +3155,35 @@ sharp@^0.32.0: tar-fs "^3.0.4" tunnel-agent "^0.6.0" +sharp@^0.33.5: + version "0.33.5" + resolved "https://registry.yarnpkg.com/sharp/-/sharp-0.33.5.tgz#13e0e4130cc309d6a9497596715240b2ec0c594e" + integrity sha512-haPVm1EkS9pgvHrQ/F3Xy+hgcuMV0Wm9vfIBSiwZ05k+xgb0PkBQpGsAA/oWdDobNaZTH5ppvHtzCFbnSEwHVw== + dependencies: + color "^4.2.3" + detect-libc "^2.0.3" + semver "^7.6.3" + optionalDependencies: + "@img/sharp-darwin-arm64" "0.33.5" + "@img/sharp-darwin-x64" "0.33.5" + "@img/sharp-libvips-darwin-arm64" "1.0.4" + "@img/sharp-libvips-darwin-x64" "1.0.4" + "@img/sharp-libvips-linux-arm" "1.0.5" + "@img/sharp-libvips-linux-arm64" "1.0.4" + "@img/sharp-libvips-linux-s390x" "1.0.4" + "@img/sharp-libvips-linux-x64" "1.0.4" + "@img/sharp-libvips-linuxmusl-arm64" "1.0.4" + "@img/sharp-libvips-linuxmusl-x64" "1.0.4" + "@img/sharp-linux-arm" "0.33.5" + "@img/sharp-linux-arm64" "0.33.5" + "@img/sharp-linux-s390x" "0.33.5" + "@img/sharp-linux-x64" "0.33.5" + "@img/sharp-linuxmusl-arm64" "0.33.5" + "@img/sharp-linuxmusl-x64" "0.33.5" + "@img/sharp-wasm32" "0.33.5" + "@img/sharp-win32-ia32" "0.33.5" + "@img/sharp-win32-x64" "0.33.5" + side-channel@^1.0.4: version "1.0.6" resolved "https://registry.yarnpkg.com/side-channel/-/side-channel-1.0.6.tgz#abd25fb7cd24baf45466406b1096b7831c9215f2" @@ -3343,6 +3492,11 @@ tslib@>=2, tslib@^2.0.1, tslib@^2.5.0, tslib@^2.6.2: resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae" integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q== +tslib@^2.4.0: + version "2.8.1" + resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.8.1.tgz#612efe4ed235d567e8aba5f2a5fab70280ade83f" + integrity sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w== + tunnel-agent@^0.6.0: version "0.6.0" resolved "https://registry.yarnpkg.com/tunnel-agent/-/tunnel-agent-0.6.0.tgz#27a5dea06b36b04a0a9966774b290868f0fc40fd"