From ecf4295537a5d7e23e4337815caa064e7ca65e48 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Mon, 18 Dec 2023 17:17:26 -0800
Subject: [PATCH] Add ability to grab youtube transcripts via doc processor
 (#470)

* Add ability to grab youtube transcripts via doc processor

* dynamic imports
swap out Github for Youtube in placeholder text
---
 collector/extensions/index.js                 |  19 +++
 collector/package.json                        |   4 +-
 .../extensions/YoutubeTranscript/index.js     |  95 +++++++++++++++
 collector/yarn.lock                           |  54 ++++++++-
 .../components/DataConnectorOption/index.jsx  |   8 ++
 .../DataConnectorOption/media/index.js        |   4 +
 .../DataConnectorOption/media/youtube.png     | Bin 0 -> 5412 bytes
 .../Documents/Directory/FileRow/index.jsx     |  10 +-
 .../Documents/Directory/FolderRow/index.jsx   |   3 +-
 .../Documents/Directory/index.jsx             |   3 +-
 .../WorkspaceFileRow/index.jsx                |   3 +-
 .../Documents/WorkspaceDirectory/index.jsx    |   6 +-
 frontend/src/models/dataConnector.js          |  18 +++
 .../Connectors/Youtube/index.jsx              | 114 ++++++++++++++++++
 .../DataConnectors/Connectors/index.jsx       |   2 +
 .../GeneralSettings/DataConnectors/index.jsx  |   1 +
 frontend/src/utils/directories.js             |   2 +-
 frontend/src/utils/paths.js                   |   3 +
 server/endpoints/extensions/index.js          |  21 ++++
 19 files changed, 353 insertions(+), 17 deletions(-)
 create mode 100644 collector/utils/extensions/YoutubeTranscript/index.js
 create mode 100644 frontend/src/components/DataConnectorOption/media/youtube.png
 create mode 100644 frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx

diff --git a/collector/extensions/index.js b/collector/extensions/index.js
index 7b131b646..bcf2229f2 100644
--- a/collector/extensions/index.js
+++ b/collector/extensions/index.js
@@ -47,6 +47,25 @@ function extensions(app) {
     }
     return;
   });
+
+  app.post("/ext/youtube-transcript", async function (request, response) {
+    try {
+      const loadYouTubeTranscript = require("../utils/extensions/YoutubeTranscript");
+      const { success, reason, data } = await loadYouTubeTranscript(reqBody(request));
+      response.status(200).json({ success, reason, data });
+    } catch (e) {
+      console.error(e);
+      response.status(400).json({
+        success: false,
+        reason: e.message,
+        data: {
+          title: null,
+          author: null
+        }
+      });
+    }
+    return;
+  });
 }
 
 module.exports = extensions;
diff --git a/collector/package.json b/collector/package.json
index fb9bed67a..0e81b72a5 100644
--- a/collector/package.json
+++ b/collector/package.json
@@ -38,7 +38,9 @@
     "slugify": "^1.6.6",
     "url-pattern": "^1.0.3",
     "uuid": "^9.0.0",
-    "wavefile": "^11.0.0"
+    "wavefile": "^11.0.0",
+    "youtube-transcript": "^1.0.6",
+    "youtubei.js": "^8.0.0"
   },
   "devDependencies": {
     "nodemon": "^2.0.22",
diff --git a/collector/utils/extensions/YoutubeTranscript/index.js b/collector/utils/extensions/YoutubeTranscript/index.js
new file mode 100644
index 000000000..7e88bb7a0
--- /dev/null
+++ b/collector/utils/extensions/YoutubeTranscript/index.js
@@ -0,0 +1,95 @@
+const { YoutubeLoader } = require("langchain/document_loaders/web/youtube");
+const fs = require("fs");
+const path = require("path");
+const { default: slugify } = require("slugify");
+const { v4 } = require("uuid");
+const { writeToServerDocuments } = require("../../files");
+const { tokenizeString } = require("../../tokenizer");
+
+/**
+ * Check that `url` is an https youtu.be or youtube.com/watch video link.
+ * @param {string} url - Candidate video URL.
+ * @returns {boolean} True when a videoId segment is matched in the URL.
+ */
+function validYoutubeVideoUrl(url) {
+  const UrlPattern = require("url-pattern");
+  const patterns = [
+    "https\\://youtu.be/(:videoId)",
+    "https\\://(www.)youtube.com/watch?v=(:videoId)",
+  ];
+
+  // A URL is valid when either pattern captures a non-empty videoId.
+  return patterns.some((p) => Boolean(new UrlPattern(p).match(url)?.videoId));
+}
+
+/**
+ * Fetch a YouTube video's transcript and save it as a server document.
+ * @param {{url: string}} params - youtu.be or youtube.com/watch video URL.
+ * @returns {Promise<{success: boolean, reason: string|null, data?: object}>}
+ *   `data` ({ title, author, destination }) is only set on success.
+ */
+async function loadYouTubeTranscript({ url }) {
+  if (!validYoutubeVideoUrl(url)) {
+    return {
+      success: false,
+      reason: "Invalid URL. Should be youtu.be or youtube.com/watch.",
+    };
+  }
+
+  console.log(`-- Working YouTube ${url} --`);
+  const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
+  const docs = await loader.load();
+
+  if (!docs.length) {
+    return {
+      success: false,
+      reason: "No transcript found for that YouTube video.",
+    };
+  }
+
+  const { title, author, description } = docs[0].metadata;
+  const content = docs.map((doc) => doc.pageContent).join("");
+
+  if (!content.length) {
+    return {
+      success: false,
+      reason: "No transcript could be parsed for that YouTube video.",
+    };
+  }
+
+  const outFolder = slugify(`${author} YouTube transcripts`).toLowerCase();
+  const outFolderPath = path.resolve(
+    __dirname,
+    `../../../../server/storage/documents/${outFolder}`
+  );
+  fs.mkdirSync(outFolderPath, { recursive: true }); // no-op if it exists
+
+  const data = {
+    id: v4(),
+    url: url + ".youtube",
+    title: title || url,
+    docAuthor: author,
+    description,
+    docSource: url,
+    chunkSource: url,
+    published: new Date().toLocaleString(),
+    wordCount: content.split(" ").length,
+    pageContent: content,
+    token_count_estimate: tokenizeString(content).length,
+  };
+
+  console.log(`[YouTube Loader]: Saving ${title} to ${outFolder}`);
+  writeToServerDocuments(data, `${slugify(title)}-${data.id}`, outFolderPath);
+
+  return {
+    success: true,
+    reason: null,
+    data: {
+      title,
+      author,
+      destination: outFolder,
+    },
+  };
+}
+
+module.exports = loadYouTubeTranscript;
diff --git a/collector/yarn.lock b/collector/yarn.lock
index 28c610926..6501aac95 100644
--- a/collector/yarn.lock
+++ b/collector/yarn.lock
@@ -39,6 +39,11 @@
     chalk "^2.4.2"
     js-tokens "^4.0.0"
 
+"@fastify/busboy@^2.0.0":
+  version "2.1.0"
+  resolved "https://registry.yarnpkg.com/@fastify/busboy/-/busboy-2.1.0.tgz#0709e9f4cb252351c609c6e6d8d6779a8d25edff"
+  integrity sha512-+KpH+QxZU7O4675t3mnkQKcZZg56u+K/Ct2K+N2AZYNVK8kyeo/bI18tI8aPm3tvNNRyTWfj6s5tnGNlcbQRsA==
+
 "@googleapis/youtube@^9.0.0":
   version "9.0.0"
   resolved "https://registry.yarnpkg.com/@googleapis/youtube/-/youtube-9.0.0.tgz#e45f6f5f7eac198c6391782b94b3ca54bacf0b63"
@@ -252,6 +257,11 @@ accepts@~1.3.8:
     mime-types "~2.1.34"
     negotiator "0.6.3"
 
+acorn@^8.8.0:
+  version "8.11.2"
+  resolved "https://registry.yarnpkg.com/acorn/-/acorn-8.11.2.tgz#ca0d78b51895be5390a5903c5b3bdcdaf78ae40b"
+  integrity sha512-nc0Axzp/0FILLEVsm4fNwLCwMttvhEI263QtVPQcbpfZZ3ts0hLsZGOpE6czNlid7CJ9MlyH8reXkpsf3YUY4w==
+
 agent-base@6:
   version "6.0.2"
   resolved "https://registry.yarnpkg.com/agent-base/-/agent-base-6.0.2.tgz#49fff58577cfee3f37176feab4c22e00f86d7f77"
@@ -554,6 +564,11 @@ camelcase@6:
   resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-6.3.0.tgz#5685b95eb209ac9c0c177467778c9c84df58ba9a"
   integrity sha512-Gmy6FhYlCY7uOElZUSbxo2UCDH8owEk996gkbrpsgGtrJLM3J7jGxl9Ic7Qwwj4ivOE5AWZWRMecDdF7hqGjFA==
 
+centra@^2.6.0:
+  version "2.6.0"
+  resolved "https://registry.yarnpkg.com/centra/-/centra-2.6.0.tgz#79117998ee6908642258db263871381aa5d1204a"
+  integrity sha512-dgh+YleemrT8u85QL11Z6tYhegAs3MMxsaWAq/oXeAmYJ7VxL3SI9TZtnfaEvNDMAPolj25FXIb3S+HCI4wQaQ==
+
 chalk@^2.4.2:
   version "2.4.2"
   resolved "https://registry.yarnpkg.com/chalk/-/chalk-2.4.2.tgz#cd42541677a54333cf541a49108c1432b44c9424"
@@ -1655,6 +1670,13 @@ isexe@^2.0.0:
   resolved "https://registry.yarnpkg.com/isexe/-/isexe-2.0.0.tgz#e8fbf374dc556ff8947a10dcb0572d633f2cfa10"
   integrity sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==
 
+jintr@^1.1.0:
+  version "1.1.0"
+  resolved "https://registry.yarnpkg.com/jintr/-/jintr-1.1.0.tgz#223a3b07f5e03d410cec6e715c537c8ad1e714c3"
+  integrity sha512-Tu9wk3BpN2v+kb8yT6YBtue+/nbjeLFv4vvVC4PJ7oCidHKbifWhvORrAbQfxVIQZG+67am/mDagpiGSVtvrZg==
+  dependencies:
+    acorn "^8.8.0"
+
 js-tiktoken@^1.0.7:
   version "1.0.7"
   resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5"
@@ -2431,6 +2453,13 @@ pend@~1.2.0:
   resolved "https://registry.yarnpkg.com/pend/-/pend-1.2.0.tgz#7a57eb550a6783f9115331fcf4663d5c8e007a50"
   integrity sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==
 
+phin@^3.5.0:
+  version "3.7.0"
+  resolved "https://registry.yarnpkg.com/phin/-/phin-3.7.0.tgz#eeeff7660408515d8cf0c6252901012d4ab7153b"
+  integrity sha512-DqnVNrpYhKGBZppNKprD+UJylMeEKOZxHgPB+ZP6mGzf3uA2uox4Ep9tUm+rUc8WLIdHT3HcAE4X8fhwQA9JKg==
+  dependencies:
+    centra "^2.6.0"
+
 picomatch@^2.0.4, picomatch@^2.2.1:
   version "2.3.1"
   resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.3.1.tgz#3ba3833733646d9d3e4995946c1365a67fb07a42"
@@ -3069,7 +3098,7 @@ tr46@~0.0.3:
   resolved "https://registry.yarnpkg.com/tr46/-/tr46-0.0.3.tgz#8184fd347dac9cdc185992f3a6622e14b9d9ab6a"
   integrity sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==
 
-tslib@^2.0.1:
+tslib@^2.0.1, tslib@^2.5.0:
   version "2.6.2"
   resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.6.2.tgz#703ac29425e7b37cd6fd456e92404d46d1f3e4ae"
   integrity sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==
@@ -3122,6 +3151,13 @@ undici-types@~5.26.4:
   resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-5.26.5.tgz#bcd539893d00b56e964fd2657a4866b221a65617"
   integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==
 
+undici@^5.19.1:
+  version "5.28.2"
+  resolved "https://registry.yarnpkg.com/undici/-/undici-5.28.2.tgz#fea200eac65fc7ecaff80a023d1a0543423b4c91"
+  integrity sha512-wh1pHJHnUeQV5Xa8/kyQhO7WFa8M34l026L5P/+2TYiakvGy5Rdc8jWZVyG7ieht/0WgJLEd3kcU5gKx+6GC8w==
+  dependencies:
+    "@fastify/busboy" "^2.0.0"
+
 universalify@^0.1.0:
   version "0.1.2"
   resolved "https://registry.yarnpkg.com/universalify/-/universalify-0.1.2.tgz#b646f69be3942dabcecc9d6639c80dc105efaa66"
@@ -3279,6 +3315,22 @@ yauzl@^2.10.0, yauzl@^2.4.2:
     buffer-crc32 "~0.2.3"
     fd-slicer "~1.1.0"
 
+youtube-transcript@^1.0.6:
+  version "1.0.6"
+  resolved "https://registry.yarnpkg.com/youtube-transcript/-/youtube-transcript-1.0.6.tgz#8414c04380d3ef1102bd00ca3729e94c46ae7a14"
+  integrity sha512-k/6uxB9voj/5astl6+q+VArX/aWHhnmle8BucvUCTYTQQEOSVlBiXkrI0KD3o8A0b44MV6q0bmVNiJFIpTlcZA==
+  dependencies:
+    phin "^3.5.0"
+
+youtubei.js@^8.0.0:
+  version "8.0.0"
+  resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-8.0.0.tgz#0fcbe332e263d9be6afe4e3d1917e9ddc1ffbed3"
+  integrity sha512-kUwHvqoB5vfaGaY1quAGcX5JPIyjr5fjj9Zj/ZwUDCrermz/r5uIkNiJ5cNHkmAJbZP9fdygzNMvGHd7fM445g==
+  dependencies:
+    jintr "^1.1.0"
+    tslib "^2.5.0"
+    undici "^5.19.1"
+
 zod-to-json-schema@3.20.3:
   version "3.20.3"
   resolved "https://registry.yarnpkg.com/zod-to-json-schema/-/zod-to-json-schema-3.20.3.tgz#8c95d8c20f20455ffa0b4b526c29703f35f6d787"
diff --git a/frontend/src/components/DataConnectorOption/index.jsx b/frontend/src/components/DataConnectorOption/index.jsx
index 84af0ff1e..df7fad0f6 100644
--- a/frontend/src/components/DataConnectorOption/index.jsx
+++ b/frontend/src/components/DataConnectorOption/index.jsx
@@ -36,4 +36,12 @@ export const DATA_CONNECTORS = {
       "Import an entire public or private Github repository in a single click.",
     link: "https://github.com",
   },
+  "youtube-transcript": {
+    name: "YouTube Transcript",
+    path: paths.settings.dataConnectors.youtubeTranscript(),
+    image: ConnectorImages.youtube,
+    description:
+      "Import the transcription of an entire YouTube video from a link.",
+    link: "https://youtube.com",
+  },
 };
diff --git a/frontend/src/components/DataConnectorOption/media/index.js b/frontend/src/components/DataConnectorOption/media/index.js
index a339328ef..b3bacc1de 100644
--- a/frontend/src/components/DataConnectorOption/media/index.js
+++ b/frontend/src/components/DataConnectorOption/media/index.js
@@ -1,5 +1,9 @@
 import Github from "./github.png";
+import YouTube from "./youtube.png";
+
 const ConnectorImages = {
   github: Github,
+  youtube: YouTube,
 };
+
 export default ConnectorImages;
diff --git a/frontend/src/components/DataConnectorOption/media/youtube.png b/frontend/src/components/DataConnectorOption/media/youtube.png
new file mode 100644
index 0000000000000000000000000000000000000000..aed2b0475cafadf03769b671baeeeaa58581a68f
GIT binary patch
literal 5412
zcmds5cUV(tvrm*RMG*l(>4-==M0yaUW7HLdAR-A#P#`3M1PCaC6hTFbh#=}(2)Igw
zNKup`p~S)>5+ztDg1Q(1L5g4m`A*zjKXvcDf82Zj`p)ybPtKWl=67a(GiTm;a%B5f
zHzfrv1rP|N<l*kT0|Wv?Buh>Pki;cqMFT(bba#IS2&CL9S>Sv#Wo<xc%XGmpH`5{_
z$W$hX6=g%Radvbyck{<Odeb~?(2y`2l!cWA(!$ajWq~%|h_tjoqpcuJ8<aT`g|tQ>
zpo1f&Adu8Vs1J^b!)~!5(js645-pevV@1#bd=SWvWdqzskeLKMRs@C0uwmJoeATc4
z?j>Tl2}VKhugflq6k@Z(+4V0~;K|-(ACpP9fx}~CV_~u8Fj~}JIMUkM8je80Q79;&
z0cFHdnFJP;$}j|weuQ!+Gl)^4bY>`xswaUa1k<9K_9iCZwElF7MEv#|dUO=!>q|&P
zIGIA0@WOy2VaOl6BoS?xp-c+-C&wKr%%3EbP)Q^<6asaxy$K6SB8L#7DNGa0PU=^C
zk{*f72;EE7gX+ZrjztHP?cfqODWTute3kqN4gW73_^S3@y#Gro9GT3h(BN-$P#G}7
zS4p&lLl}@=IDsS~1n~nQe_Gi%k{I7PE{VjO9Ql(3phU7kCqyyGK5=xiy~z$TgBBe{
zB!6EAe*tceA`_To5(b4pnM08%DAL*v{?pdCRNDQu;lg5ikTAhkmPj-ifrKJZRv}P}
zjU+VGYGbfDGz4LeKoQJ?tx*W{H>^KR{(<Bg3;>}~Xw*iuHPQlUX^pl*e6#tw`41Bx
zS~PK=BrFW-o94e||5pER()=S`e~0=bt^ZGnrhoTsNwWO)qm|?r*Z}ft{zHBNnT=yK
za~~}V;}aU8x0w({hzKSy?Ig;7Z~PS8|GgA}w(&z9e)Et;p#P%+0cD%7)k2H%q0uOq
z%>-%;!4Cej_<wTp?<Vw<4$zLiD`EJzc81}1R>gooDtR8xjy|ljV||esy?f8y{jjZX
z`AQn8ppSym-&|?$=X;fxVIKgOZh2N$hU5I!GsV5&gj@ynm5OL{m@j5syP;WOQtCFS
zum|~E9pa;f@bnx(hk1I3g{PFAJLw-Iz@7@E;(Lq<<Jo)DSNHUFkYfM%gc%r~J;<xf
z!#*GGIT<%Ke{kyD!0zj{wYBR(^p{wnEC_VY00-*T`wv`CNeQ60*N<tiuk2#gBjDJc
z0Efh4p_su?3jNp{^rdRX+-%X-Ph3?{mUyPRrh0Nbf0nbbH#VzUN5zlbh)rD3^dDNJ
zJ}9~#Dl@bg!sc&b+4aa}Ygb(xSY!m`l#b|ccd4#XGcfUZQDCqUcWNUoSDfBFULe!#
zx42ORE%6<FU|5=-RTC&-dtbuxw64q!A8e7ehR(L0mRUY!F3#N3yJk70^=6!xTrd9C
zs;V7{a=YV<oZnwtIyl_1+(03@Tj}x4M0BH$lljg7@v~<WRU#Yk1Am6wO$N!7Y1>?p
zZg%awlAo`GwCi0tVTnZuJAP*o=SHvvdPSpUqHD&ZBiFp@YR2*>8tN~vsu}cr+HgG}
zU?s08+<oOMeauW4Yq!H=@yza)$+z1rI{KbI%w1kRnNc0i6!jRUsq8`~cNfa*Up<~a
zUU8mk-~YBOaMPBK!ZDrjHV#Uwu|S5CT|PXyOgL&SdW{t2uM~vCy%MclPHtCRc)`9o
zo>zY0GPAoz(A{%$546KshgWpfW1Bt1ekzqIX3uniM4ENGY$hT_=c)2_=EDbMM*xtn
z%>sr)6-HGrVSJH04UbJdSKy6FnQ4v`U8K3g7aoY+L5UyV7f>4l;yb}3hPNn<$>_j>
z#?LMy(cNp=QWN7#bqgBj&kyE%n9NI!Xt=RIJLvBDu=LT|60pe3AqO+(F=8lk7xQ)%
z7U<M&jvsOyF$7OWKXw#-+<d<?u4FBTpE+Hlealz6l^c79IiBay$tcn0Oht>lWSetL
zlB{B~JqB!HN+ofDW*ypdE6kesBbQqk;|Kc0?!G7cmBY*R69#QU=Uo8Uqv6wig4y%f
z_sT0pW8C+RX*jTuAEemX*2-SRF`P}i*#qLv{=Rb1`g9I+tNbF}Q`|ccNi)?mPB_C}
z6i2YUqn8Mt5(4Y>5<dF|OS9Kq9}dyt#ZT&srl*8%V7&3=^D3QPQcusTZVX|csdx>w
z-1I;u*4;tNGO_#7!StE8`hXEtOi|T%84-(HHYMmV9E303^c>L1?%CqN5yqs?{J!nT
zs+Vok{-QZzX*yn;;|zNZew*`wA%x+ohS}cR&OlZ0Yq#1*h%droc;W$*ywhW#M;C$5
zDM%L`&=14G*H{K<HOZW{3%OAp$lVh!q=bUBCmzUGl|%rQle_NAzDkQErTB&FXQON^
z_hEflB_s)kj8}dk6+VCRekJI3>sxEeN>0x+0=T^4qir79+E|MIDeTp{_UBUVzdZ!q
zW+N`A0<J%Co+g8tUDN(~ehRN-k3DwOQ<3R>Gsi39TF**9BFn&kT>$}&Z03S$f64G}
zz8^Mi44zKn{gJX(J!uBXa-h%SyE~nOR5z(We)saO`OS;3q+VyH2hXh_wkZio=yy?|
zc*x{BBP!Q`rVF0NHE2cOs^uM>*Ok3+*WiwP(OTaX3Gc#7bq^oOBsH`Yaa;qH%dA-0
z(=7vD<Dd7A7#@oCmkqzI6K&18h}zH>>mz>XEhxX~ud7sFWDoB^UxMB>C3FQcQl?w{
zN;((ne6+Y2EtZA<@f$w%_zpCiPpN)GU%fHn1of8r*>(LBp(DkIELT_>d`KqGeGtBV
zwB27UZV{;bnpb|CFW&dk<rEKaAwWTU`eArUiwV2#OMq4{A;H2uuyxt^hvYWt*hI|q
zX?nsjq#Vb%eXaWCR7^v367UdFsTE<Goc0hQbEw<dZvRmYh^y9Qlcv<Eb`;Khlv~Nu
zlzNU1X=%7_kf8xd=|pVKM#>y2b+6;Fctcy2J_Rc-p|#$6y%LVgy^)D%)pU+@iT?#U
zv_|-BpBHaAae3Rxo!r`kJMA(Z)x6lIpyagvV8u(C2KIMfNgc~Q986R+Nc5LB8&Yb_
zLGbk^^SjGZ>!Gq^nko0Tv9a>Dkf#Ji16_ZmEL8~SFfzlAG$UNIRX)-*({WBc^dahf
z`gTx3vu0}0iWvXM<hE;My0vD?glTg$$BK8GrT+kRPm#DBci7vBosE>!HTXTaP2QJ%
z5k<;EW>mXa<qe%&-iVONEvLY4FYV)NI>%Sudi${B^CL6BS<kch5BgW%8uRrOQoNmB
zry%75R{Q~#SD$RrOg&E7V=M&$EY~Kd)gxtc^WDdd&t_CAayRhxDw|%eKbnTj=sz++
zxPMml5=%3Mehh!H7WhfWgEJj#9l1&C_<ASYxJfaYjz-7v;IiAzVh~sH^PUTVMA!J&
zI-+|EnXim8@L;T@EwVIXHT)|EJv$5Za82zdY}rOgLsYZa{lKvaCuyEjB(|rdCDy}D
z_Xb9lnSsB^g6+x4>NT;CqSb~-TMkK;RcyP>LO$5hRCF-G$A(#FfutrFeY*dLmXPjp
z_56`aHSVS7EV*u4)-GL-#@fTul;uPPe|_9N4WdHi3~l6-s?@Sige^S2aEHW!d$L4@
zXUA4@cqom&F565~Heo-kLY3ODN?xr77>Zu*bi%I!#3pCLalML4ea(eVw=3iF5zV)t
z3Q5Q`=FE(X|KV#Yg1d!IVuO098;}4~;udgw>wdWYBU8TGylJ-cSB2)n<ZZbRFAZ^2
z2fLSA-7QR}<?g`&FP58HO+SiEvm@2C7~pX7M1@cH-tBbSD&LibOq)L`a|zO-yzB#_
zG^c`-jBZjcd!*@oZvb!o02P_M>Pn+K;9^@&xAHOkDiz=kb@H*Y*9oBUJ>FXkl{L&X
zRrB_}oGK8c#U|&Y`D#Z)r_L1jvCa2z51R>Eb&H|t8JgbfZ@AZ)nTBv{v+{NMYI<ZX
zh^zJ_uW-q8@@fbbnJ3Fv<L++laIeRI__$_3)4M6mfa)rzF60+3ff={sUF<qt3J|49
zTS;+B;lu9LC0xf$)0a`C)sPsR2;gC4#Rua2kaYNz`jsbebVxDuh%&M?Y3AKXJ2Txl
z?HEJK44P?*H#EpL0~!?GKYr}ZZ(hLw-xstE66*cDd-0Kj_DU-K_Y0TYHXNEUyARO&
zQRh~VST5(-JzI{YOCWSIaZQqrAu!Tq91yBm;&MA<fyPF*co#Lw1#yCems5dLX7bnH
zyq3Jp9SrQkTxv9>a*aa$dZ9*1I!G(xQC&#L^(4E|IJaqIpgZGqQ-)EzC0vZuvLx*^
zxl_D2c#_WgRy3z0pt=1bXnw=^k&mIBV?(lDC&~{LNIE;|X2<ChSgsXR>VUbV;s4pr
zx7%{M64tdZd8=AH477F-Ea>7^d9S5{_sF~wFy+zv>l14>;#WdEZok`A1qhM`89gxE
zxHbL;aE#Q7<Eu)>l4oWu533NQUb+RHXLcxsw7-y+%x>dJ&n7`a!oZ=4UUaYq0Pw;0
z@f+~&b+3j;R~-YckTUr1UY6NF1#~EbH>H0)1TPJ!-#Ry@3?xBR`eKU;AjU*r4&@k)
ztUYA}wB<x?a)C6PLQkL0yLx}E*7IlY0fNFk#G{tivX?aN!hFU~;XF)7xkjQPY0-`F
zlE<9^y7EbH`2A*UF@%FQ>r1#TCz^v={d9w2%iOU+0|LfljPH&Oyc3eqNV%Vwpay!r
zw%)|fa%C>xlu%cZ=o9}@exw*g@52gi9t=xROVn?W91rl_CdcpWOEB6LdAu<|{H1Ep
z<$5Zc;rX<>#wcNU{mt`W&Zy3^PF`!Uafv3xLu-TAfbG`mq^B|2X~X^7pGoGD5H5Q{
zBW5nOI`+;?E~t$go5*@-tYv2U2JbOget5usUBc@Nz((BXdyXQ9tTTNc)!w2&W>5z_
z%YAlW9XbRUtnRs82@*dbh*Hj1V@_nv$)7ShSK+17tG_ULSLn86ZdPS+L*K#mBJZX*
z2c*kHX|DvE2Xgj^^8wT^qkZtc_?zu7PJUMOeSDuNC{AY&9}EHF5tr9_h@Wa;$L}}(
z(wNdc?TQtDT6Djw`%3A_rn>91p0XJIg;!9)$D=b{6r2t(%yK_$w0@7Z{(>l0FlQS2
zv3}+ka6^xS8YcG>ZD1AV=*YVf?D(DjWuK(OjgPOsvJr-bBy^VvPW!dYUIB<*YSZIj
zG^Ulmp;+)fHuY{tkO}6*Bc(Pu;V$(*O74ec_)V1oCf&`%_o+t<Jr@LYmkrtOwr)<D
z4-5y(P(`m_kb@0y1(gWhA+&#l3+1YM`57!U0jYg>>-0=baCL*Kj^&-ALmQf4p@wq4
zmcXR(c*gfh>BwOC$K7-OQ^+2{<P*g%y!8PkXabGDkGp0`{<9gQWm0B0fQj8&qr43L
zN?0gAu=iowi;*W~gSLmNx)isE>K%v(itoJxZM~crFc;>U5%`ORlv#9@*NXkc>hBYR
zDh`i(*9c=xM_;p-H>Q0m2@G<qxD&B(Mo0~*Ri%!<xPy)Ny0YrxwGv;7=Td!VTE9R%
z<*v%xkQ~y=_MJPmSZ{yOY=JTMrJku!zoH|F!?v63NxZQT%-HW!bkY9{gLDKBrW=mE
mXa{Z5PL^&``VU;k7iBYJ6jUy3x5|<H0qNng)w$dWpZqWTsR3pH

literal 0
HcmV?d00001

diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx
index f83a9e34c..cd695dfcf 100644
--- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FileRow/index.jsx
@@ -60,16 +60,19 @@ export default function FileRow({
         selected ? "bg-sky-500/20" : ""
       } ${expanded ? "bg-sky-500/10" : ""}`}`}
     >
-      <div className="pl-4 col-span-4 flex gap-x-[4px] items-center">
+      <div className="pl-2 col-span-6 flex gap-x-[4px] items-center">
         <div
-          className="w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
+          className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
           role="checkbox"
           aria-checked={selected}
           tabIndex={0}
         >
           {selected && <div className="w-2 h-2 bg-white rounded-[2px]" />}
         </div>
-        <File className="text-base font-bold w-4 h-4 mr-[3px]" weight="fill" />
+        <File
+          className="shrink-0 text-base font-bold w-4 h-4 mr-[3px]"
+          weight="fill"
+        />
         <div
           className="relative"
           onMouseEnter={handleMouseEnter}
@@ -88,7 +91,6 @@ export default function FileRow({
       <p className="col-span-2 pl-3.5 whitespace-nowrap">
         {formatDate(item?.published)}
       </p>
-      <p className="col-span-2 pl-3">{item?.size || "---"}</p>
       <p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p>
       <div className="col-span-2 flex justify-end items-center">
         {item?.cached && (
diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx
index c93a45cd3..5b7f1be39 100644
--- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/FolderRow/index.jsx
@@ -53,7 +53,7 @@ export default function FolderRow({
           selected ? "bg-sky-500/20" : ""
         }`}
       >
-        <div className="col-span-4 flex gap-x-[4px] items-center">
+        <div className="col-span-6 flex gap-x-[4px] items-center">
           <div
             className="shrink-0 w-3 h-3 rounded border-[1px] border-white flex justify-center items-center cursor-pointer"
             role="checkbox"
@@ -79,7 +79,6 @@ export default function FolderRow({
           </p>
         </div>
         <p className="col-span-2 pl-3.5" />
-        <p className="col-span-2 pl-3" />
         <p className="col-span-2 pl-2" />
         <div className="col-span-2 flex justify-end items-center">
           {item.name !== "custom-documents" && (
diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx
index dcf625c5e..1dd83de9a 100644
--- a/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/Documents/Directory/index.jsx
@@ -71,9 +71,8 @@ export default function Directory({
 
         <div className="relative w-[560px] h-[310px] bg-zinc-900 rounded-2xl">
           <div className="rounded-t-2xl text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 shadow-lg bg-zinc-900 sticky top-0 z-10">
-            <p className="col-span-4">Name</p>
+            <p className="col-span-6">Name</p>
             <p className="col-span-2">Date</p>
-            <p className="col-span-2">Size</p>
             <p className="col-span-2">Kind</p>
             <p className="col-span-2">Cached</p>
           </div>
diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx
index da75ec02f..ceb751558 100644
--- a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/WorkspaceFileRow/index.jsx
@@ -54,7 +54,7 @@ export default function WorkspaceFileRow({
       className={`items-center transition-all duration-200 text-white/80 text-xs grid grid-cols-12 py-2 pl-3.5 pr-8 border-b border-white/20 hover:bg-sky-500/20 cursor-pointer
           ${isMovedItem ? "bg-green-800/40" : ""}`}
     >
-      <div className="col-span-4 flex gap-x-[4px] items-center">
+      <div className="col-span-6 flex gap-x-[4px] items-center">
         <File
           className="text-base font-bold w-4 h-4 ml-3 mr-[3px]"
           weight="fill"
@@ -77,7 +77,6 @@ export default function WorkspaceFileRow({
       <p className="col-span-2 pl-3.5 whitespace-nowrap">
         {formatDate(item?.published)}
       </p>
-      <p className="col-span-2 pl-3">{item?.size || "---"}</p>
       <p className="col-span-2 pl-2 uppercase">{getFileExtension(item.url)}</p>
       <div className="col-span-2 flex justify-end items-center">
         {item?.cached && (
diff --git a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx
index 9969e844e..e1ec21dd4 100644
--- a/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx
+++ b/frontend/src/components/Modals/MangeWorkspace/Documents/WorkspaceDirectory/index.jsx
@@ -26,9 +26,8 @@ export default function WorkspaceDirectory({
         </div>
         <div className="relative w-[560px] h-[445px] bg-zinc-900 rounded-2xl mt-5">
           <div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20">
-            <p className="col-span-4">Name</p>
+            <p className="col-span-6">Name</p>
             <p className="col-span-2">Date</p>
-            <p className="col-span-2">Size</p>
             <p className="col-span-2">Kind</p>
             <p className="col-span-2">Cached</p>
           </div>
@@ -56,9 +55,8 @@ export default function WorkspaceDirectory({
         }`}
       >
         <div className="text-white/80 text-xs grid grid-cols-12 py-2 px-8 border-b border-white/20 bg-zinc-900 sticky top-0 z-10">
-          <p className="col-span-4">Name</p>
+          <p className="col-span-6">Name</p>
           <p className="col-span-2">Date</p>
-          <p className="col-span-2">Size</p>
           <p className="col-span-2">Kind</p>
           <p className="col-span-2">Cached</p>
         </div>
diff --git a/frontend/src/models/dataConnector.js b/frontend/src/models/dataConnector.js
index 45d575024..e0b3c0c3e 100644
--- a/frontend/src/models/dataConnector.js
+++ b/frontend/src/models/dataConnector.js
@@ -42,6 +42,24 @@ const DataConnector = {
         });
     },
   },
+  youtube: {
+    transcribe: async ({ url }) => {
+      return await fetch(`${API_BASE}/ext/youtube/transcript`, {
+        method: "POST",
+        headers: baseHeaders(),
+        body: JSON.stringify({ url }),
+      })
+        .then((res) => res.json())
+        .then((res) => {
+          if (!res.success) throw new Error(res.reason);
+          return { data: res.data, error: null };
+        })
+        .catch((e) => {
+          console.error(e);
+          return { data: null, error: e.message };
+        });
+    },
+  },
 };
 
 export default DataConnector;
diff --git a/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx
new file mode 100644
index 000000000..5252e3fd2
--- /dev/null
+++ b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/Youtube/index.jsx
@@ -0,0 +1,114 @@
+import React, { useState } from "react";
+import Sidebar, { SidebarMobileHeader } from "@/components/SettingsSidebar";
+import { isMobile } from "react-device-detect";
+import { DATA_CONNECTORS } from "@/components/DataConnectorOption";
+import System from "@/models/system";
+import showToast from "@/utils/toast";
+
+export default function YouTubeTranscriptConnectorSetup() {
+  const { image } = DATA_CONNECTORS["youtube-transcript"];
+  const [loading, setLoading] = useState(false);
+  const handleSubmit = async (e) => {
+    e.preventDefault();
+    const form = new FormData(e.target);
+    try {
+      setLoading(true);
+      showToast("Fetching transcript for YouTube video.", "info", {
+        clear: true,
+        autoClose: false,
+      });
+      const { data, error } = await System.dataConnectors.youtube.transcribe({
+        url: form.get("url"),
+      });
+
+      if (!!error) {
+        showToast(error, "error", { clear: true });
+        return;
+      }
+
+      const folderNote = data.destination
+        ? ` Output folder is ${data.destination}.`
+        : ""; // the collector may not report an output folder
+      showToast(
+        `${data.title} by ${data.author} transcription completed.${folderNote}`,
+        "success",
+        { clear: true }
+      );
+      e.target.reset();
+    } catch (e) {
+      console.error(e);
+      showToast(e.message, "error", { clear: true });
+    } finally {
+      // Runs on every exit path (early return, success, throw) so the
+      // submit button is always re-enabled.
+      setLoading(false);
+    }
+  };
+
+  return (
+    <div className="w-screen h-screen overflow-hidden bg-sidebar flex">
+      {!isMobile && <Sidebar />}
+      <div
+        style={{ height: isMobile ? "100%" : "calc(100% - 32px)" }}
+        className="relative md:ml-[2px] md:mr-[16px] md:my-[16px] md:rounded-[26px] bg-main-gradient w-full h-full overflow-y-scroll border-4 border-accent"
+      >
+        {isMobile && <SidebarMobileHeader />}
+        <div className="flex w-full">
+          <div className="flex flex-col w-full px-1 md:px-20 md:py-12 py-16">
+            <div className="flex w-full gap-x-4 items-center  pb-6 border-white border-b-2 border-opacity-10">
+              <img src={image} alt="YouTube" className="rounded-lg h-16 w-16" />
+              <div className="w-full flex flex-col gap-y-1">
+                <div className="items-center flex gap-x-4">
+                  <p className="text-2xl font-semibold text-white">
+                    Import YouTube transcription
+                  </p>
+                </div>
+                <p className="text-sm font-base text-white text-opacity-60">
+                  From a youtube link, import the entire transcript of that
+                  video for embedding.
+                </p>
+              </div>
+            </div>
+
+            <form className="w-full" onSubmit={handleSubmit}>
+              <div className="w-full flex flex-col py-2">
+                <div className="w-full flex items-center gap-4">
+                  <div className="flex flex-col w-60">
+                    <div className="flex flex-col gap-y-1 mb-4">
+                      <label className="text-white text-sm font-semibold block">
+                        YouTube video URL
+                      </label>
+                    </div>
+                    <input
+                      type="url"
+                      name="url"
+                      className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                      placeholder="https://youtube.com/watch?v=abc123"
+                      required={true}
+                      autoComplete="off"
+                      spellCheck={false}
+                    />
+                  </div>
+                </div>
+              </div>
+
+              <div className="flex flex-col gap-y-2 w-fit">
+                <button
+                  type="submit"
+                  disabled={loading}
+                  className="mt-2 text-lg w-fit border border-slate-200 px-4 py-1 rounded-lg text-slate-200 items-center flex gap-x-2 hover:bg-slate-200 hover:text-slate-800 disabled:bg-slate-200 disabled:text-slate-800"
+                >
+                  {loading ? "Collecting transcript..." : "Collect transcript"}
+                </button>
+                {loading && (
+                  <p className="text-xs text-zinc-300">
+                    Once complete, the transcription will be available for
+                    embedding into workspaces in the document picker.
+                  </p>
+                )}
+              </div>
+            </form>
+          </div>
+        </div>
+      </div>
+    </div>
+  );
+}
diff --git a/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx
index cbd66f08a..edb6aae07 100644
--- a/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx
+++ b/frontend/src/pages/GeneralSettings/DataConnectors/Connectors/index.jsx
@@ -2,9 +2,11 @@ import paths from "@/utils/paths";
 import { lazy } from "react";
 import { useParams } from "react-router-dom";
 const Github = lazy(() => import("./Github"));
+const YouTubeTranscript = lazy(() => import("./Youtube"));
 
 const CONNECTORS = {
   github: Github,
+  "youtube-transcript": YouTubeTranscript,
 };
 
 export default function DataConnectorSetup() {
diff --git a/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx b/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx
index 76dc13d0a..50f4298ed 100644
--- a/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx
+++ b/frontend/src/pages/GeneralSettings/DataConnectors/index.jsx
@@ -29,6 +29,7 @@ export default function DataConnectors() {
             </div>
             <div className="py-4 w-full flex md:flex-wrap overflow-x-scroll gap-4 max-w-full">
               <DataConnectorOption slug="github" />
+              <DataConnectorOption slug="youtube-transcript" />
             </div>
           </div>
         </div>
diff --git a/frontend/src/utils/directories.js b/frontend/src/utils/directories.js
index b2a1d493f..5a65b5336 100644
--- a/frontend/src/utils/directories.js
+++ b/frontend/src/utils/directories.js
@@ -13,7 +13,7 @@ export function getFileExtension(path) {
 
 export function middleTruncate(str, n) {
   const fileExtensionPattern = /([^.]*)$/;
-  const extensionMatch = str.match(fileExtensionPattern);
+  const extensionMatch = str.includes(".") && str.match(fileExtensionPattern);
 
   if (str.length <= n) return str;
 
diff --git a/frontend/src/utils/paths.js b/frontend/src/utils/paths.js
index c21c1500b..2812878fb 100644
--- a/frontend/src/utils/paths.js
+++ b/frontend/src/utils/paths.js
@@ -83,6 +83,9 @@ export default {
       github: () => {
         return "/settings/data-connectors/github";
       },
+      youtubeTranscript: () => {
+        return "/settings/data-connectors/youtube-transcript";
+      },
     },
   },
 };
diff --git a/server/endpoints/extensions/index.js b/server/endpoints/extensions/index.js
index fc545ce3c..1b3770374 100644
--- a/server/endpoints/extensions/index.js
+++ b/server/endpoints/extensions/index.js
@@ -48,6 +48,27 @@ function extensionEndpoints(app) {
       }
     }
   );
+
+  app.post(
+    "/ext/youtube/transcript",
+    [validatedRequest, flexUserRoleValid],
+    async (request, response) => {
+      try {
+        const responseFromProcessor = await forwardExtensionRequest({
+          endpoint: "/ext/youtube-transcript",
+          method: "POST",
+          body: request.body,
+        });
+        await Telemetry.sendTelemetry("extension_invoked", {
+          type: "youtube_transcript",
+        });
+        response.status(200).json(responseFromProcessor);
+      } catch (e) {
+        console.error(e);
+        response.sendStatus(500).end();
+      }
+    }
+  );
 }
 
 module.exports = { extensionEndpoints };
-- 
GitLab