From ecf4295537a5d7e23e4337815caa064e7ca65e48 Mon Sep 17 00:00:00 2001 From: Timothy Carambat <rambat1010@gmail.com> Date: Mon, 18 Dec 2023 17:17:26 -0800 Subject: [PATCH] Add ability to grab youtube transcripts via doc processor (#470) * Add ability to grab youtube transcripts via doc processor * dynamic imports swap out Github for Youtube in placeholder text --- collector/extensions/index.js | 19 +++ collector/package.json | 4 +- .../extensions/YoutubeTranscript/index.js | 95 +++++++++++++++ collector/yarn.lock | 54 ++++++++- .../components/DataConnectorOption/index.jsx | 8 ++ .../DataConnectorOption/media/index.js | 4 + .../DataConnectorOption/media/youtube.png | Bin 0 -> 5412 bytes .../Documents/Directory/FileRow/index.jsx | 10 +- .../Documents/Directory/FolderRow/index.jsx | 3 +- .../Documents/Directory/index.jsx | 3 +- .../WorkspaceFileRow/index.jsx | 3 +- .../Documents/WorkspaceDirectory/index.jsx | 6 +- frontend/src/models/dataConnector.js | 18 +++ .../Connectors/Youtube/index.jsx | 114 ++++++++++++++++++ .../DataConnectors/Connectors/index.jsx | 2 + .../GeneralSettings/DataConnectors/index.jsx | 1 + frontend/src/utils/directories.js | 2 +- frontend/src/utils/paths.js | 3 + server/endpoints/extensions/index.js | 21 ++++ 19 files changed, 353 insertions(+), 17 deletions(-) create mode 100644 collector/utils/extensions/YoutubeTranscript/index.js create mode 100644 frontend/src/components/DataConnectorOption/media/youtube.png create mode 100644 frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx diff --git a/collector/extensions/index.js b/collector/extensions/index.js index 7b131b646..bcf2229f2 100644 --- a/collector/extensions/index.js +++ b/collector/extensions/index.js @@ -47,6 +47,25 @@ function extensions(app) { } return; }); + + app.post("/ext/youtube-transcript", async function (request, response) { + try { + const loadYouTubeTranscript = require("../utils/extensions/YoutubeTranscript"); + const { success, reason, data } = await loadYouTubeTranscript(reqBody(request)); + response.status(200).json({ success, reason, data }); + } catch (e) { + console.error(e); + response.status(400).json({ + success: false, + reason: e.message, + data: { + title: null, + author: null + } + }); + } + return; + }); } module.exports = extensions; diff --git a/collector/package.json b/collector/package.json index fb9bed67a..0e81b72a5 100644 --- a/collector/package.json +++ b/collector/package.json @@ -38,7 +38,9 @@ "slugify": "^1.6.6", "url-pattern": "^1.0.3", "uuid": "^9.0.0", - "wavefile": "^11.0.0" + "wavefile": "^11.0.0", + "youtube-transcript": "^1.0.6", + "youtubei.js": "^8.0.0" }, "devDependencies": { "nodemon": "^2.0.22", diff --git a/collector/utils/extensions/YoutubeTranscript/index.js b/collector/utils/extensions/YoutubeTranscript/index.js new file mode 100644 index 000000000..7e88bb7a0 --- /dev/null +++ b/collector/utils/extensions/YoutubeTranscript/index.js @@ -0,0 +1,95 @@ +const { YoutubeLoader } = require("langchain/document_loaders/web/youtube"); +const fs = require("fs"); +const path = require("path"); +const { default: slugify } = require("slugify"); +const { v4 } = require("uuid"); +const { writeToServerDocuments } = require("../../files"); +const { tokenizeString } = require("../../tokenizer"); + +function validYoutubeVideoUrl(url) { + const UrlPattern = require("url-pattern"); + + const shortPatternMatch = new UrlPattern( + "https\\://youtu.be/(:videoId)" + ).match(url); + const fullPatternMatch = new UrlPattern( + "https\\://(www.)youtube.com/watch?v=(:videoId)" + ).match(url); + const videoId = + shortPatternMatch?.videoId || fullPatternMatch?.videoId || null; + if (!!videoId) return true; + + return false; +} + +async function loadYouTubeTranscript({ url }) { + if (!validYoutubeVideoUrl(url)) { + return { + success: false, + reason: "Invalid URL. Should be youtu.be or youtube.com/watch.", + }; + } + + console.log(`-- Working YouTube ${url} --`); + const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true }); + const docs = await loader.load(); + + if (!docs.length) { + return { + success: false, + reason: "No transcript found for that YouTube video.", + }; + } + + const metadata = docs[0].metadata; + let content = ""; + docs.forEach((doc) => (content = content.concat(doc.pageContent))); + + if (!content.length) { + return { + success: false, + reason: "No transcript could be parsed for that YouTube video.", + }; + } + + const outFolder = slugify( + `${metadata.author} YouTube transcripts` + ).toLowerCase(); + const outFolderPath = path.resolve( + __dirname, + `../../../../server/storage/documents/${outFolder}` + ); + if (!fs.existsSync(outFolderPath)) fs.mkdirSync(outFolderPath); + + const data = { + id: v4(), + url: url + ".youtube", + title: metadata.title || url, + docAuthor: metadata.author, + description: metadata.description, + docSource: url, + chunkSource: url, + published: new Date().toLocaleString(), + wordCount: content.split(" ").length, + pageContent: content, + token_count_estimate: tokenizeString(content).length, + }; + + console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`); + writeToServerDocuments( + data, + `${slugify(metadata.title)}-${data.id}`, + outFolderPath + ); + + return { + success: true, + reason: "test", + data: { + title: metadata.title, + author: metadata.author, + }, + }; +} + +module.exports = loadYouTubeTranscript; diff --git a/collector/yarn.lock b/collector/yarn.lock index 28c610926..6501aac95 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -39,6 +39,11 @@ chalk "^2.4.2" js-tokens "^4.0.0" +"@fastify/busboy@^2.0.0": + version "2.1.0" + resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.0.tgz#0709e9f4cb252351c609c6e6d8d6779a8d25edff" + integrity sha512-+KpH+QxZU7O4675t3mnkQKcZZg56u+K/Ct2K+N2AZYNVK8kyeo/bI18tI8aPm3tvNNRyTWfj6s5tnGNlcbQRsA== + "@googleapis/youtube@^9.0.0": version "9.0.0" resolved "https://registry.yarnpkg.com/@googleapis/youtube/-/youtube-9.0.0.tgz#e45f6f5f7eac198c6391782b94b3ca54bacf0b63" @@ -252,6 +257,11 @@ accepts@~1.3.8: mime-types "~2.1.34" negotiator "0.6.3" +acorn@^8.8.0: + version "8.11.2" + resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.11.2.tgz#ca0d78b51895be5390a5903c5b3bdcdaf78ae40b" + integrity sha512-nc0Axzp/0FILLEVsm4fNwLCwMttvhEI263QtVPQcbpfZZ3ts0hLsZGOpE6czNlid7CJ9MlyH8reXkpsf3YUY4w== + agent-base@6: version "6.0.2" resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-6.0.2.tgz#49fff58577cfee3f37176feab4c22e00f86d7f77" @@ -554,6 +564,11 @@ camelcase@6: resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a" integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA== +centra@^2.6.0: + version "2.6.0" + resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a" + integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ== + chalk@^2.4.2: version "2.4.2" resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424" @@ -1655,6 +1670,13 @@ isexe@^2.0.0: resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10" integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw== +jintr@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3" + integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg== + dependencies: + acorn "^8.8.0" + js-tiktoken@^1.0.7: version "1.0.7" resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5" @@ -2431,6 +2453,13 @@ pend@~1.2.0: resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50" integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg== +phin@^3.5.0: + version "3.7.0" + resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b" + integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg== + dependencies: + centra "^2.6.0" + picomatch@^2.0.4, picomatch@^2.2.1: version "2.3.1" resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42" @@ -3069,7 +3098,7 @@ tr46@~0.0.3: resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a" integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw== -tslib@^2.0.1: +tslib@^2.0.1, tslib@^2.5.0: version "2.6.2" resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae" integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q== @@ -3122,6 +3151,13 @@ undici-types@~5.26.4: resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.26.5.tgz#bcd539893d00b56e964fd2657a4866b221a65617" integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA== +undici@^5.19.1: + version "5.28.2" + resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.2.tgz#fea200eac65fc7ecaff80a023d1a0543423b4c91" + integrity sha512-wh1pHJHnUeQV5Xa8/kyQhO7WFa8M34l026L5P/+2TYiakvGy5Rdc8jWZVyG7ieht/0WgJLEd3kcU5gKx+6GC8w== + dependencies: + "@fastify/busboy" "^2.0.0" + universalify@^0.1.0: version "0.1.2" resolved "https://registry.yarnpkg.com/universalify/-/universalify-0.1.2.tgz#b646f69be3942dabcecc9d6639c80dc105efaa66" @@ -3279,6 +3315,22 @@ yauzl@^2.10.0, yauzl@^2.4.2: buffer-crc32 "~0.2.3" fd-slicer "~1.1.0" +youtube-transcript@^1.0.6: + version "1.0.6" + resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14" + integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA== + dependencies: + phin "^3.5.0" + +youtubei.js@^8.0.0: + version "8.0.0" + resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-8.0.0.tgz#0fcbe332e263d9be6afe4e3d1917e9ddc1ffbed3" + integrity sha512-kUwHvqoB5vfaGaY1quAGcX5JPIyjr5fjj9Zj/ZwUDCrermz/r5uIkNiJ5cNHkmAJbZP9fdygzNMvGHd7fM445g== + dependencies: + jintr "^1.1.0" + tslib "^2.5.0" + undici "^5.19.1" + zod-to-json-schema@3.20.3: version "3.20.3" resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.20.3.tgz#8c95d8c20f20455ffa0b4b526c29703f35f6d787" diff --git a/frontend/src/components/DataConnectorOption/index.jsx b/frontend/src/components/DataConnectorOption/index.jsx index 84af0ff1e..df7fad0f6 100644 --- a/frontend/src/components/DataConnectorOption/index.jsx +++ b/frontend/src/components/DataConnectorOption/index.jsx @@ -36,4 +36,12 @@ export const DATA_CONNECTORS = { "Import an entire public or private Github repository in a single click.", link: "https://github.com", }, + "youtube-transcript": { + name: "YouTube Transcript", + path: paths.settings.dataConnectors.youtubeTranscript(), + image: ConnectorImages.youtube, + description: + "Import the transcription of an entire YouTube video from a link.", + link: "https://youtube.com", + }, }; diff --git a/frontend/src/components/DataConnectorOption/media/index.js b/frontend/src/components/DataConnectorOption/media/index.js index a339328ef..b3bacc1de 100644 --- a/frontend/src/components/DataConnectorOption/media/index.js +++ b/frontend/src/components/DataConnectorOption/media/index.js @@ -1,5 +1,9 @@ import Github from "./github.png"; +import YouTube from "./youtube.png"; + const ConnectorImages = { github: Github, + youtube: YouTube, }; + export default ConnectorImages; diff --git a/frontend/src/components/DataConnectorOption/media/youtube.png b/frontend/src/components/DataConnectorOption/media/youtube.png new file mode 100644 index 0000000000000000000000000000000000000000..aed2b0475cafadf03769b671baeeeaa58581a68f GIT binary patch literal 5412 zcmds5cUV(tvrm*RMG*l(>4-==M0yaUW7HLdAR-A#P#`3M1PCaC6hTFbh#=}(2)Igw zNKup`p~S)>5+ztDg1Q(1L5g4m`A*zjKXvcDf82Zj`p)ybPtKWl=67a(GiTm;a%B5f zHzfrv1rP|N<l*kT0|Wv?Buh>Pki;cqMFT(bba#IS2&CL9S>Sv#Wo<xc%XGmpH`5{_ z$W$hX6=g%Radvbyck{<Odeb~?(2y`2l!cWA(!$ajWq~%|h_tjoqpcuJ8<aT`g|tQ> zpo1f&Adu8Vs1J^b!)~!5(js645-pevV@1#bd=SWvWdqzskeLKMRs@C0uwmJoeATc4 z?j>Tl2}VKhugflq6k@Z(+4V0~;K|-(ACpP9fx}~CV_~u8Fj~}JIMUkM8je80Q79;& z0cFHdnFJP;$}j|weuQ!+Gl)^4bY>`xswaUa1k<9K_9iCZwElF7MEv#|dUO=!>q|&P zIGIA0@WOy2VaOl6BoS?xp-c+-C&wKr%%3EbP)Q^<6asaxy$K6SB8L#7DNGa0PU=^C zk{*f72;EE7gX+ZrjztHP?cfqODWTute3kqN4gW73_^S3@y#Gro9GT3h(BN-$P#G}7 zS4p&lLl}@=IDsS~1n~nQe_Gi%k{I7PE{VjO9Ql(3phU7kCqyyGK5=xiy~z$TgBBe{ zB!6EAe*tceA`_To5(b4pnM08%DAL*v{?pdCRNDQu;lg5ikTAhkmPj-ifrKJZRv}P} zjU+VGYGbfDGz4LeKoQJ?tx*W{H>^KR{(<Bg3;>}~Xw*iuHPQlUX^pl*e6#tw`41Bx zS~PK=BrFW-o94e||5pER()=S`e~0=bt^ZGnrhoTsNwWO)qm|?r*Z}ft{zHBNnT=yK za~~}V;}aU8x0w({hzKSy?Ig;7Z~PS8|GgA}w(&z9e)Et;p#P%+0cD%7)k2H%q0uOq z%>-%;!4Cej_<wTp?<Vw<4$zLiD`EJzc81}1R>gooDtR8xjy|ljV||esy?f8y{jjZX z`AQn8ppSym-&|?$=X;fxVIKgOZh2N$hU5I!GsV5&gj@ynm5OL{m@j5syP;WOQtCFS zum|~E9pa;f@bnx(hk1I3g{PFAJLw-Iz@7@E;(Lq<<Jo)DSNHUFkYfM%gc%r~J;<xf z!#*GGIT<%Ke{kyD!0zj{wYBR(^p{wnEC_VY00-*T`wv`CNeQ60*N<tiuk2#gBjDJc z0Efh4p_su?3jNp{^rdRX+-%X-Ph3?{mUyPRrh0Nbf0nbbH#VzUN5zlbh)rD3^dDNJ zJ}9~#Dl@bg!sc&b+4aa}Ygb(xSY!m`l#b|ccd4#XGcfUZQDCqUcWNUoSDfBFULe!# zx42ORE%6<FU|5=-RTC&-dtbuxw64q!A8e7ehR(L0mRUY!F3#N3yJk70^=6!xTrd9C zs;V7{a=YV<oZnwtIyl_1+(03@Tj}x4M0BH$lljg7@v~<WRU#Yk1Am6wO$N!7Y1>?p zZg%awlAo`GwCi0tVTnZuJAP*o=SHvvdPSpUqHD&ZBiFp@YR2*>8tN~vsu}cr+HgG} zU?s08+<oOMeauW4Yq!H=@yza)$+z1rI{KbI%w1kRnNc0i6!jRUsq8`~cNfa*Up<~a zUU8mk-~YBOaMPBK!ZDrjHV#Uwu|S5CT|PXyOgL&SdW{t2uM~vCy%MclPHtCRc)`9o zo>zY0GPAoz(A{%$546KshgWpfW1Bt1ekzqIX3uniM4ENGY$hT_=c)2_=EDbMM*xtn z%>sr)6-HGrVSJH04UbJdSKy6FnQ4v`U8K3g7aoY+L5UyV7f>4l;yb}3hPNn<$>_j> z#?LMy(cNp=QWN7#bqgBj&kyE%n9NI!Xt=RIJLvBDu=LT|60pe3AqO+(F=8lk7xQ)% z7U<M&jvsOyF$7OWKXw#-+<d<?u4FBTpE+Hlealz6l^c79IiBay$tcn0Oht>lWSetL zlB{B~JqB!HN+ofDW*ypdE6kesBbQqk;|Kc0?!G7cmBY*R69#QU=Uo8Uqv6wig4y%f z_sT0pW8C+RX*jTuAEemX*2-SRF`P}i*#qLv{=Rb1`g9I+tNbF}Q`|ccNi)?mPB_C} z6i2YUqn8Mt5(4Y>5<dF|OS9Kq9}dyt#ZT&srl*8%V7&3=^D3QPQcusTZVX|csdx>w z-1I;u*4;tNGO_#7!StE8`hXEtOi|T%84-(HHYMmV9E303^c>L1?%CqN5yqs?{J!nT zs+Vok{-QZzX*yn;;|zNZew*`wA%x+ohS}cR&OlZ0Yq#1*h%droc;W$*ywhW#M;C$5 zDM%L`&=14G*H{K<HOZW{3%OAp$lVh!q=bUBCmzUGl|%rQle_NAzDkQErTB&FXQON^ z_hEflB_s)kj8}dk6+VCRekJI3>sxEeN>0x+0=T^4qir79+E|MIDeTp{_UBUVzdZ!q zW+N`A0<J%Co+g8tUDN(~ehRN-k3DwOQ<3R>Gsi39TF**9BFn&kT>$}&Z03S$f64G} zz8^Mi44zKn{gJX(J!uBXa-h%SyE~nOR5z(We)saO`OS;3q+VyH2hXh_wkZio=yy?| zc*x{BBP!Q`rVF0NHE2cOs^uM>*Ok3+*WiwP(OTaX3Gc#7bq^oOBsH`Yaa;qH%dA-0 z(=7vD<Dd7A7#@oCmkqzI6K&18h}zH>>mz>XEhxX~ud7sFWDoB^UxMB>C3FQcQl?w{ zN;((ne6+Y2EtZA<@f$w%_zpCiPpN)GU%fHn1of8r*>(LBp(DkIELT_>d`KqGeGtBV zwB27UZV{;bnpb|CFW&dk<rEKaAwWTU`eArUiwV2#OMq4{A;H2uuyxt^hvYWt*hI|q zX?nsjq#Vb%eXaWCR7^v367UdFsTE<Goc0hQbEw<dZvRmYh^y9Qlcv<Eb`;Khlv~Nu zlzNU1X=%7_kf8xd=|pVKM#>y2b+6;Fctcy2J_Rc-p|#$6y%LVgy^)D%)pU+@iT?#U zv_|-BpBHaAae3Rxo!r`kJMA(Z)x6lIpyagvV8u(C2KIMfNgc~Q986R+Nc5LB8&Yb_ zLGbk^^SjGZ>!Gq^nko0Tv9a>Dkf#Ji16_ZmEL8~SFfzlAG$UNIRX)-*({WBc^dahf z`gTx3vu0}0iWvXM<hE;My0vD?glTg$$BK8GrT+kRPm#DBci7vBosE>!HTXTaP2QJ% z5k<;EW>mXa<qe%&-iVONEvLY4FYV)NI>%Sudi${B^CL6BS<kch5BgW%8uRrOQoNmB zry%75R{Q~#SD$RrOg&E7V=M&$EY~Kd)gxtc^WDdd&t_CAayRhxDw|%eKbnTj=sz++ zxPMml5=%3Mehh!H7WhfWgEJj#9l1&C_<ASYxJfaYjz-7v;IiAzVh~sH^PUTVMA!J& zI-+|EnXim8@L;T@EwVIXHT)|EJv$5Za82zdY}rOgLsYZa{lKvaCuyEjB(|rdCDy}D z_Xb9lnSsB^g6+x4>NT;CqSb~-TMkK;RcyP>LO$5hRCF-G$A(#FfutrFeY*dLmXPjp z_56`aHSVS7EV*u4)-GL-#@fTul;uPPe|_9N4WdHi3~l6-s?@Sige^S2aEHW!d$L4@ zXUA4@cqom&F565~Heo-kLY3ODN?xr77>Zu*bi%I!#3pCLalML4ea(eVw=3iF5zV)t z3Q5Q`=FE(X|KV#Yg1d!IVuO098;}4~;udgw>wdWYBU8TGylJ-cSB2)n<ZZbRFAZ^2 z2fLSA-7QR}<?g`&FP58HO+SiEvm@2C7~pX7M1@cH-tBbSD&LibOq)L`a|zO-yzB#_ zG^c`-jBZjcd!*@oZvb!o02P_M>Pn+K;9^@&xAHOkDiz=kb@H*Y*9oBUJ>FXkl{L&X zRrB_}oGK8c#U|&Y`D#Z)r_L1jvCa2z51R>Eb&H|t8JgbfZ@AZ)nTBv{v+{NMYI<ZX zh^zJ_uW-q8@@fbbnJ3Fv<L++laIeRI__$_3)4M6mfa)rzF60+3ff={sUF<qt3J|49 zTS;+B;lu9LC0xf$)0a`C)sPsR2;gC4#Rua2kaYNz`jsbebVxDuh%&M?Y3AKXJ2Txl z?HEJK44P?*H#EpL0~!?GKYr}ZZ(hLw-xstE66*cDd-0Kj_DU-K_Y0TYHXNEUyARO& zQRh~VST5(-JzI{YOCWSIaZQqrAu!Tq91yBm;&MA<fyPF*co#Lw1#yCems5dLX7bnH zyq3Jp9SrQkTxv9>a*aa$dZ9*1I!G(xQC&#L^(4E|IJaqIpgZGqQ-)EzC0vZuvLx*^ zxl_D2c#_WgRy3z0pt=1bXnw=^k&mIBV?(lDC&~{LNIE;|X2<ChSgsXR>VUbV;s4pr zx7%{M64tdZd8=AH477F-Ea>7^d9S5{_sF~wFy+zv>l14>;#WdEZok`A1qhM`89gxE zxHbL;aE#Q7<Eu)>l4oWu533NQUb+RHXLcxsw7-y+%x>dJ&n7`a!oZ=4UUaYq0Pw;0 z@f+~&b+3j;R~-YckTUr1UY6NF1#~EbH>H0)1TPJ!-#Ry@3?xBR`eKU;AjU*r4&@k) ztUYA}wB<x?a)C6PLQkL0yLx}E*7IlY0fNFk#G{tivX?aN!hFU~;XF)7xkjQPY0-`F zlE<9^y7EbH`2A*UF@%FQ>r1#TCz^v={d9w2%iOU+0|LfljPH&Oyc3eqNV%Vwpay!r zw%)|fa%C>xlu%cZ=o9}@exw*g@52gi9t=xROVn?W91rl_CdcpWOEB6LdAu<|{H1Ep z<$5Zc;rX<>#wcNU{mt`W&Zy3^PF`!Uafv3xLu-TAfbG`mq^B|2X~X^7pGoGD5H5Q{ zBW5nOI`+;?E~t$go5*@-tYv2U2JbOget5usUBc@Nz((BXdyXQ9tTTNc)!w2&W>5z_ z%YAlW9XbRUtnRs82@*dbh*Hj1V@_nv$)7ShSK+17tG_ULSLn86ZdPS+L*K#mBJZX* z2c*kHX|DvE2Xgj^^8wT^qkZtc_?zu7PJUMOeSDuNC{AY&9}EHF5tr9_h@Wa;$L}}( z(wNdc?TQtDT6Djw`%3A_rn>91p0XJIg;!9)$D=b{6r2t(%yK_$w0@7Z{(>l0FlQS2 zv3}+ka6^xS8YcG>ZD1AV=*YVf?D(DjWuK(OjgPOsvJr-bBy^VvPW!dYUIB<*YSZIj zG^Ulmp;+)fHuY{tkO}6*Bc(Pu;V$(*O74ec_)V1oCf&`%_o+t<Jr@LYmkrtOwr)<D z4-5y(P(`m_kb@0y1(gWhA+&#l3+1YM`57!U0jYg>>-0=baCL*Kj^&-ALmQf4p@wq4 zmcXR(c*gfh>BwOC$K7-OQ^+2{<P*g%y!8PkXabGDkGp0`{<9gQWm0B0fQj8&qr43L zN?0gAu=iowi;*W~gSLmNx)isE>K%v(itoJxZM~crFc;>U5%`ORlv#9@*NXkc>hBYR zDh`i(*9c=xM_;p-H>Q0m2@G<qxD&B(Mo0~*Ri%!<xPy)Ny0YrxwGv;7=Td!VTE9R% z<*v%xkQ~y=_MJPmSZ{yOY=JTMrJku!zoH|F!?v63NxZQT%-HW!bkY9{gLDKBrW=mE mXa{Z5PL^&``VU;k7iBYJ6jUy3x5|<H0qNng)w$dWpZqWTsR3pH literal 0 HcmV?d00001 diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx index f83a9e34c..cd695dfcf 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx @@ -60,16 +60,19 @@ export default function FileRow({ selected ? "bg-sky-500/20" : "" } ${expanded ? "bg-sky-500/10" : ""}`}`} > - <div className="pl-4 col-span-4 flex gap-x-[4px] items-center"> + <div className="pl-2 col-span-6 flex gap-x-[4px] items-center"> <div - className="w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer" + className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer" role="checkbox" aria-checked={selected} tabIndex={0} > {selected && <div className="w-2 h-2 bg-white rounded-[2px]" />} </div> - <File className="text-base font-bold w-4 h-4 mr-[3px]" weight="fill" /> + <File + className="shrink-0 text-base font-bold w-4 h-4 mr-[3px]" + weight="fill" + /> <div className="relative" onMouseEnter={handleMouseEnter} @@ -88,7 +91,6 @@ export default function FileRow({ <p className="col-span-2 pl-3.5 whitespace-nowrap"> {formatDate(item?.published)} </p> - <p className="col-span-2 pl-3">{item?.size || "---"}</p> <p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p> <div className="col-span-2 flex justify-end items-center"> {item?.cached && ( diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx index c93a45cd3..5b7f1be39 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx @@ -53,7 +53,7 @@ export default function FolderRow({ selected ? "bg-sky-500/20" : "" }`} > - <div className="col-span-4 flex gap-x-[4px] items-center"> + <div className="col-span-6 flex gap-x-[4px] items-center"> <div className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer" role="checkbox" @@ -79,7 +79,6 @@ export default function FolderRow({ </p> </div> <p className="col-span-2 pl-3.5" /> - <p className="col-span-2 pl-3" /> <p className="col-span-2 pl-2" /> <div className="col-span-2 flex justify-end items-center"> {item.name !== "custom-documents" && ( diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx index dcf625c5e..1dd83de9a 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx @@ -71,9 +71,8 @@ export default function Directory({ <div className="relative w-[560px] h-[310px] bg-zinc-900 rounded-2xl"> <div className="rounded-t-2xl text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 shadow-lg bg-zinc-900 sticky top-0 z-10"> - <p className="col-span-4">Name</p> + <p className="col-span-6">Name</p> <p className="col-span-2">Date</p> - <p className="col-span-2">Size</p> <p className="col-span-2">Kind</p> <p className="col-span-2">Cached</p> </div> diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx index da75ec02f..ceb751558 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx @@ -54,7 +54,7 @@ export default function WorkspaceFileRow({ className={`items-center transition-all duration-200 text-white/80 text-xs grid grid-cols-12 py-2 pl-3.5 pr-8 border-b border-white/20 hover:bg-sky-500/20 cursor-pointer ${isMovedItem ? "bg-green-800/40" : ""}`} > - <div className="col-span-4 flex gap-x-[4px] items-center"> + <div className="col-span-6 flex gap-x-[4px] items-center"> <File className="text-base font-bold w-4 h-4 ml-3 mr-[3px]" weight="fill" @@ -77,7 +77,6 @@ export default function WorkspaceFileRow({ <p className="col-span-2 pl-3.5 whitespace-nowrap"> {formatDate(item?.published)} </p> - <p className="col-span-2 pl-3">{item?.size || "---"}</p> <p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p> <div className="col-span-2 flex justify-end items-center"> {item?.cached && ( diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx index 9969e844e..e1ec21dd4 100644 --- a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx +++ b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx @@ -26,9 +26,8 @@ export default function WorkspaceDirectory({ </div> <div className="relative w-[560px] h-[445px] bg-zinc-900 rounded-2xl mt-5"> <div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20"> - <p className="col-span-4">Name</p> + <p className="col-span-6">Name</p> <p className="col-span-2">Date</p> - <p className="col-span-2">Size</p> <p className="col-span-2">Kind</p> <p className="col-span-2">Cached</p> </div> @@ -56,9 +55,8 @@ export default function WorkspaceDirectory({ }`} > <div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 bg-zinc-900 sticky top-0 z-10"> - <p className="col-span-4">Name</p> + <p className="col-span-6">Name</p> <p className="col-span-2">Date</p> - <p className="col-span-2">Size</p> <p className="col-span-2">Kind</p> <p className="col-span-2">Cached</p> </div> diff --git a/frontend/src/models/dataConnector.js b/frontend/src/models/dataConnector.js index 45d575024..e0b3c0c3e 100644 --- a/frontend/src/models/dataConnector.js +++ b/frontend/src/models/dataConnector.js @@ -42,6 +42,24 @@ const DataConnector = { }); }, }, + youtube: { + transcribe: async ({ url }) => { + return await fetch(`${API_BASE}/ext/youtube/transcript`, { + method: "POST", + headers: baseHeaders(), + body: JSON.stringify({ url }), + }) + .then((res) => res.json()) + .then((res) => { + if (!res.success) throw new Error(res.reason); + return { data: res.data, error: null }; + }) + .catch((e) => { + console.error(e); + return { data: null, error: e.message }; + }); + }, + }, }; export default DataConnector; diff --git a/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx new file mode 100644 index 000000000..5252e3fd2 --- /dev/null +++ b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx @@ -0,0 +1,114 @@ +import React, { useState } from "react"; +import Sidebar, { SidebarMobileHeader } from "@/components/SettingsSidebar"; +import { isMobile } from "react-device-detect"; +import { DATA_CONNECTORS } from "@/components/DataConnectorOption"; +import System from "@/models/system"; +import showToast from "@/utils/toast"; + +export default function YouTubeTranscriptConnectorSetup() { + const { image } = DATA_CONNECTORS["youtube-transcript"]; + const [loading, setLoading] = useState(false); + const handleSubmit = async (e) => { + e.preventDefault(); + const form = new FormData(e.target); + + try { + setLoading(true); + showToast("Fetching transcript for YouTube video.", "info", { + clear: true, + autoClose: false, + }); + const { data, error } = await System.dataConnectors.youtube.transcribe({ + url: form.get("url"), + }); + + if (!!error) { + showToast(error, "error", { clear: true }); + setLoading(false); + return; + } + + showToast( + `${data.title} by ${data.author} transcription completed. Output folder is ${data.destination}.`, + "success", + { clear: true } + ); + e.target.reset(); + setLoading(false); + return; + } catch (e) { + console.error(e); + showToast(e.message, "error", { clear: true }); + setLoading(false); + } + }; + + return ( + <div className="w-screen h-screen overflow-hidden bg-sidebar flex"> + {!isMobile && <Sidebar />} + <div + style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }} + className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[26px] bg-main-gradient w-full h-full overflow-y-scroll border-4 border-accent" + > + {isMobile && <SidebarMobileHeader />} + <div className="flex w-full"> + <div className="flex flex-col w-full px-1 md:px-20 md:py-12 py-16"> + <div className="flex w-full gap-x-4 items-center pb-6 border-white border-b-2 border-opacity-10"> + <img src={image} alt="YouTube" className="rounded-lg h-16 w-16" /> + <div className="w-full flex flex-col gap-y-1"> + <div className="items-center flex gap-x-4"> + <p className="text-2xl font-semibold text-white"> + Import YouTube transcription + </p> + </div> + <p className="text-sm font-base text-white text-opacity-60"> + From a youtube link, import the entire transcript of that + video for embedding. + </p> + </div> + </div> + + <form className="w-full" onSubmit={handleSubmit}> + <div className="w-full flex flex-col py-2"> + <div className="w-full flex items-center gap-4"> + <div className="flex flex-col w-60"> + <div className="flex flex-col gap-y-1 mb-4"> + <label className="text-white text-sm font-semibold block"> + YouTube video URL + </label> + </div> + <input + type="url" + name="url" + className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5" + placeholder="https://youtube.com/watch?v=abc123" + required={true} + autoComplete="off" + spellCheck={false} + /> + </div> + </div> + </div> + + <div className="flex flex-col gap-y-2 w-fit"> + <button + type="submit" + disabled={loading} + className="mt-2 text-lg w-fit border border-slate-200 px-4 py-1 rounded-lg text-slate-200 items-center flex gap-x-2 hover:bg-slate-200 hover:text-slate-800 disabled:bg-slate-200 disabled:text-slate-800" + > + {loading ? "Collecting transcript..." : "Collect transcript"} + </button> + {loading && ( + <p className="text-xs text-zinc-300"> + Once complete, the transcription will be available for + embedding into workspaces in the document picker. + </p> + )} + </div> + </form> + </div> + </div> + </div> + </div> + ); +} diff --git a/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx index cbd66f08a..edb6aae07 100644 --- a/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx +++ b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx @@ -2,9 +2,11 @@ import paths from "@/utils/paths"; import { lazy } from "react"; import { useParams } from "react-router-dom"; const Github = lazy(() => import("./Github")); +const YouTubeTranscript = lazy(() => import("./Youtube")); const CONNECTORS = { github: Github, + "youtube-transcript": YouTubeTranscript, }; export default function DataConnectorSetup() { diff --git a/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx b/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx index 76dc13d0a..50f4298ed 100644 --- a/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx +++ b/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx @@ -29,6 +29,7 @@ export default function DataConnectors() { </div> <div className="py-4 w-full flex md:flex-wrap overflow-x-scroll gap-4 max-w-full"> <DataConnectorOption slug="github" /> + <DataConnectorOption slug="youtube-transcript" /> </div> </div> </div> diff --git a/frontend/src/utils/directories.js b/frontend/src/utils/directories.js index b2a1d493f..5a65b5336 100644 --- a/frontend/src/utils/directories.js +++ b/frontend/src/utils/directories.js @@ -13,7 +13,7 @@ export function getFileExtension(path) { export function middleTruncate(str, n) { const fileExtensionPattern = /([^.]*)$/; - const extensionMatch = str.match(fileExtensionPattern); + const extensionMatch = str.includes(".") && str.match(fileExtensionPattern); if (str.length <= n) return str; diff --git a/frontend/src/utils/paths.js b/frontend/src/utils/paths.js index c21c1500b..2812878fb 100644 --- a/frontend/src/utils/paths.js +++ b/frontend/src/utils/paths.js @@ -83,6 +83,9 @@ export default { github: () => { return "/settings/data-connectors/github"; }, + youtubeTranscript: () => { + return "/settings/data-connectors/youtube-transcript"; + }, }, }, }; diff --git a/server/endpoints/extensions/index.js b/server/endpoints/extensions/index.js index fc545ce3c..1b3770374 100644 --- a/server/endpoints/extensions/index.js +++ b/server/endpoints/extensions/index.js @@ -48,6 +48,27 @@ function extensionEndpoints(app) { } } ); + + app.post( + "/ext/youtube/transcript", + [validatedRequest, flexUserRoleValid], + async (request, response) => { + try { + const responseFromProcessor = await forwardExtensionRequest({ + endpoint: "/ext/youtube-transcript", + method: "POST", + body: request.body, + }); + await Telemetry.sendTelemetry("extension_invoked", { + type: "youtube_transcript", + }); + response.status(200).json(responseFromProcessor); + } catch (e) { + console.error(e); + response.sendStatus(500).end(); + } + } + ); } module.exports = { extensionEndpoints }; -- GitLab