From be3b0b491676da816f38daa007060837eab08333 Mon Sep 17 00:00:00 2001
From: Sean Hatfield <seanhatfield5@gmail.com>
Date: Tue, 6 Aug 2024 10:16:17 -0700
Subject: [PATCH] Youtube loader whitespace fix (#2051)

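Fix missing whitespace in the YouTube loader transcript: append a space
after each transcript text chunk so adjacent chunks no longer run together,
then trim the result and collapse runs of whitespace into a single space.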
---
 .../YoutubeTranscript/YoutubeLoader/youtube-transcript.js   | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
index c81c0ec56..f868875b2 100644
--- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
@@ -47,10 +47,12 @@ class YoutubeTranscript {
       let transcript = "";
       const chunks = transcriptXML.getElementsByTagName("text");
       for (const chunk of chunks) {
-        transcript += chunk.textContent;
+        // Add space after each text chunk
+        transcript += chunk.textContent + " ";
       }
 
-      return transcript;
+      // Trim extra whitespace
+      return transcript.trim().replace(/\s+/g, " ");
     } catch (e) {
       throw new YoutubeTranscriptError(e);
     }
-- 
GitLab