From 42235fcd8a5fd5210554580ebbbb3004947119b3 Mon Sep 17 00:00:00 2001
From: Timothy Carambat <rambat1010@gmail.com>
Date: Tue, 23 Jul 2024 12:23:51 -0700
Subject: [PATCH] GitLab Hosted and Local Connector (#1932)

* Add support for GitLab repo collection as well as Github Repo collection
* Refactor for repo collectors to be more compact

---------

Co-authored-by: Emil Rofors <emirof@gmail.com>
---
 .github/workflows/dev-build.yaml              |   2 +-
 collector/extensions/index.js                 |  14 +-
 collector/extensions/resync/index.js          |   2 +-
 collector/package.json                        |   3 +-
 .../GithubRepo/RepoLoader/index.js            |  40 ++-
 .../{ => RepoLoader}/GithubRepo/index.js      |   6 +-
 .../RepoLoader/GitlabRepo/RepoLoader/index.js | 289 ++++++++++++++++
 .../extensions/RepoLoader/GitlabRepo/index.js | 145 ++++++++
 .../utils/extensions/RepoLoader/index.js      |  41 +++
 collector/yarn.lock                           |  14 +
 .../DataConnectorOption/media/gitlab.svg      |   7 +
 .../DataConnectorOption/media/index.js        |   2 +
 .../Connectors/Gitlab/index.jsx               | 310 ++++++++++++++++++
 .../ManageWorkspace/DataConnectors/index.jsx  |   8 +
 frontend/src/models/dataConnector.js          |  39 +++
 server/endpoints/extensions/index.js          |  27 +-
 .../middleware/isSupportedRepoProviders.js    |  12 +
 17 files changed, 939 insertions(+), 22 deletions(-)
 rename collector/utils/extensions/{ => RepoLoader}/GithubRepo/RepoLoader/index.js (80%)
 rename collector/utils/extensions/{ => RepoLoader}/GithubRepo/index.js (95%)
 create mode 100644 collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
 create mode 100644 collector/utils/extensions/RepoLoader/GitlabRepo/index.js
 create mode 100644 collector/utils/extensions/RepoLoader/index.js
 create mode 100644 frontend/src/components/DataConnectorOption/media/gitlab.svg
 create mode 100644 frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
 create mode 100644 server/utils/middleware/isSupportedRepoProviders.js

diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
index 40f4971c5..860ea5f6f 100644
--- a/.github/workflows/dev-build.yaml
+++ b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['1915-docker-perms'] # master branch only. Do not modify.
+    branches: ['-dev'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'
diff --git a/collector/extensions/index.js b/collector/extensions/index.js
index a88b38eee..30beaa3e7 100644
--- a/collector/extensions/index.js
+++ b/collector/extensions/index.js
@@ -1,5 +1,6 @@
 const { setDataSigner } = require("../middleware/setDataSigner");
 const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
+const { resolveRepoLoader, resolveRepoLoaderFunction } = require("../utils/extensions/RepoLoader");
 const { reqBody } = require("../utils/http");
 const { validURL } = require("../utils/url");
 const RESYNC_METHODS = require("./resync");
@@ -28,15 +29,16 @@ function extensions(app) {
   )
 
   app.post(
-    "/ext/github-repo",
+    "/ext/:repo_platform-repo",
     [verifyPayloadIntegrity, setDataSigner],
     async function (request, response) {
       try {
-        const { loadGithubRepo } = require("../utils/extensions/GithubRepo");
-        const { success, reason, data } = await loadGithubRepo(
+        const loadRepo = resolveRepoLoaderFunction(request.params.repo_platform);
+        const { success, reason, data } = await loadRepo(
           reqBody(request),
           response,
         );
+        console.log({ success, reason, data })
         response.status(200).json({
           success,
           reason,
@@ -56,12 +58,12 @@ function extensions(app) {
 
   // gets all branches for a specific repo
   app.post(
-    "/ext/github-repo/branches",
+    "/ext/:repo_platform-repo/branches",
     [verifyPayloadIntegrity],
     async function (request, response) {
       try {
-        const GithubRepoLoader = require("../utils/extensions/GithubRepo/RepoLoader");
-        const allBranches = await new GithubRepoLoader(
+        const RepoLoader = resolveRepoLoader(request.params.repo_platform);
+        const allBranches = await new RepoLoader(
           reqBody(request)
         ).getRepoBranches();
         response.status(200).json({
diff --git a/collector/extensions/resync/index.js b/collector/extensions/resync/index.js
index ba967962e..66882ba7a 100644
--- a/collector/extensions/resync/index.js
+++ b/collector/extensions/resync/index.js
@@ -86,7 +86,7 @@ async function resyncGithub({ chunkSource }, response) {
     // Github file data is `payload` encrypted (might contain PAT). So we need to expand its
     // encrypted payload back into query params so we can reFetch the page with same access token/params.
     const source = response.locals.encryptionWorker.expandPayload(chunkSource);
-    const { fetchGithubFile } = require("../../utils/extensions/GithubRepo");
+    const { fetchGithubFile } = require("../../utils/extensions/RepoLoader/GithubRepo");
     const { success, reason, content } = await fetchGithubFile({
       repoUrl: `https:${source.pathname}`, // need to add back the real protocol
       branch: source.searchParams.get('branch'),
diff --git a/collector/package.json b/collector/package.json
index 5e3873d1e..cbc5ceed0 100644
--- a/collector/package.json
+++ b/collector/package.json
@@ -32,6 +32,7 @@
     "mammoth": "^1.6.0",
     "mbox-parser": "^1.0.1",
     "mime": "^3.0.0",
+    "minimatch": "5.1.0",
     "moment": "^2.29.4",
     "multer": "^1.4.5-lts.1",
     "node-html-parser": "^6.1.13",
@@ -50,4 +51,4 @@
     "nodemon": "^2.0.22",
     "prettier": "^2.4.1"
   }
-}
+}
\ No newline at end of file
diff --git a/collector/utils/extensions/GithubRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js
similarity index 80%
rename from collector/utils/extensions/GithubRepo/RepoLoader/index.js
rename to collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js
index af8a1dfc3..08121f44f 100644
--- a/collector/utils/extensions/GithubRepo/RepoLoader/index.js
+++ b/collector/utils/extensions/RepoLoader/GithubRepo/RepoLoader/index.js
@@ -1,4 +1,21 @@
-class RepoLoader {
+/**
+ * @typedef {Object} RepoLoaderArgs
+ * @property {string} repo - The GitHub repository URL.
+ * @property {string} [branch] - The branch to load from (optional).
+ * @property {string} [accessToken] - GitHub access token for authentication (optional).
+ * @property {string[]} [ignorePaths] - Array of paths to ignore when loading (optional).
+ */
+
+/**
+ * @class
+ * @classdesc Loads and manages GitHub repository content.
+ */
+class GitHubRepoLoader {
+  /**
+   * Creates an instance of RepoLoader.
+   * @param {RepoLoaderArgs} [args] - The configuration options.
+   * @returns {GitHubRepoLoader}
+   */
   constructor(args = {}) {
     this.ready = false;
     this.repo = args?.repo;
@@ -67,6 +84,10 @@ class RepoLoader {
     return;
   }
 
+  /**
+   * Initializes the RepoLoader instance.
+   * @returns {Promise<GitHubRepoLoader|undefined>} The initialized loader, or undefined when the repo URL is invalid.
+   */
   async init() {
     if (!this.#validGithubUrl()) return;
     await this.#validBranch();
@@ -75,6 +96,11 @@ class RepoLoader {
     return this;
   }
 
+  /**
+   * Recursively loads the repository content.
+   * @returns {Promise<Array<Object>>} An array of loaded documents.
+   * @throws {Error} If the RepoLoader is not in a ready state.
+   */
   async recursiveLoader() {
     if (!this.ready) throw new Error("[Github Loader]: not in ready state!");
     const {
@@ -109,7 +135,10 @@ class RepoLoader {
     }, []);
   }
 
-  // Get all branches for a given repo.
+  /**
+   * Retrieves all branches for the repository.
+   * @returns {Promise<string[]>} An array of branch names.
+   */
   async getRepoBranches() {
     if (!this.#validGithubUrl() || !this.author || !this.project) return [];
     await this.#validateAccessToken(); // Ensure API access token is valid for pre-flight
@@ -151,6 +180,11 @@ class RepoLoader {
     return this.#branchPrefSort(this.branches);
   }
 
+  /**
+   * Fetches the content of a single file from the repository.
+   * @param {string} sourceFilePath - The path to the file in the repository.
+   * @returns {Promise<string|null>} The content of the file, or null if fetching fails.
+   */
   async fetchSingleFile(sourceFilePath) {
     try {
       return fetch(
@@ -182,4 +216,4 @@ class RepoLoader {
   }
 }
 
-module.exports = RepoLoader;
+module.exports = GitHubRepoLoader;
diff --git a/collector/utils/extensions/GithubRepo/index.js b/collector/utils/extensions/RepoLoader/GithubRepo/index.js
similarity index 95%
rename from collector/utils/extensions/GithubRepo/index.js
rename to collector/utils/extensions/RepoLoader/GithubRepo/index.js
index f40215cbe..41147278c 100644
--- a/collector/utils/extensions/GithubRepo/index.js
+++ b/collector/utils/extensions/RepoLoader/GithubRepo/index.js
@@ -3,8 +3,8 @@ const fs = require("fs");
 const path = require("path");
 const { default: slugify } = require("slugify");
 const { v4 } = require("uuid");
-const { writeToServerDocuments } = require("../../files");
-const { tokenizeString } = require("../../tokenizer");
+const { writeToServerDocuments } = require("../../../files");
+const { tokenizeString } = require("../../../tokenizer");
 
 /**
  * Load in a Github Repo recursively or just the top level if no PAT is provided
@@ -42,7 +42,7 @@ async function loadGithubRepo(args, response) {
     process.env.NODE_ENV === "development"
       ? path.resolve(
           __dirname,
-          `../../../../server/storage/documents/${outFolder}`
+          `../../../../../server/storage/documents/${outFolder}`
         )
       : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
 
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
new file mode 100644
index 000000000..c90932986
--- /dev/null
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/RepoLoader/index.js
@@ -0,0 +1,289 @@
+const minimatch = require("minimatch");
+
+/**
+ * @typedef {Object} RepoLoaderArgs
+ * @property {string} repo - The GitLab repository URL.
+ * @property {string} [branch] - The branch to load from (optional).
+ * @property {string} [accessToken] - GitLab access token for authentication (optional).
+ * @property {string[]} [ignorePaths] - Array of paths to ignore when loading (optional).
+ */
+
+/**
+ * @typedef {Object} FileTreeObject
+ * @property {string} id - The file object ID.
+ * @property {string} name - name of file.
+ * @property {('blob'|'tree')} type - type of file object.
+ * @property {string} path - path + name of file.
+ * @property {string} mode - Linux permission code.
+ */
+
+/**
+ * @class
+ * @classdesc Loads and manages GitLab repository content.
+ */
+class GitLabRepoLoader {
+  /**
+   * Creates an instance of RepoLoader.
+   * @param {RepoLoaderArgs} [args] - The configuration options.
+   * @returns {GitLabRepoLoader}
+   */
+  constructor(args = {}) {
+    this.ready = false;
+    this.repo = args?.repo;
+    this.branch = args?.branch;
+    this.accessToken = args?.accessToken || null;
+    this.ignorePaths = args?.ignorePaths || [];
+
+    this.projectId = null;
+    this.apiBase = "https://gitlab.com";
+    this.author = null;
+    this.project = null;
+    this.branches = [];
+  }
+
+  #validGitlabUrl() {
+    const UrlPattern = require("url-pattern");
+    const validPatterns = [
+      new UrlPattern("https\\://gitlab.com/(:projectId(*))", {
+        segmentValueCharset: "a-zA-Z0-9-._~%/+",
+      }),
+      // This should even match the regular hosted URL, but we may want to know
+      // if this was a hosted GitLab (above) or a self-hosted (below) instance
+      // since the API interface could be different.
+      new UrlPattern(
+        "(:protocol(http|https))\\://(:hostname*)/(:projectId(*))",
+        {
+          segmentValueCharset: "a-zA-Z0-9-._~%/+",
+        }
+      ),
+    ];
+
+    let match = null;
+    for (const pattern of validPatterns) {
+      if (match !== null) continue;
+      match = pattern.match(this.repo);
+    }
+    if (!match) return false;
+    const [author, project] = match.projectId.split("/");
+
+    this.projectId = encodeURIComponent(match.projectId);
+    this.apiBase = new URL(this.repo).origin;
+    this.author = author;
+    this.project = project;
+    return true;
+  }
+
+  async #validBranch() {
+    await this.getRepoBranches();
+    if (!!this.branch && this.branches.includes(this.branch)) return;
+
+    console.log(
+      "[Gitlab Loader]: Branch not set! Auto-assigning to a default branch."
+    );
+    this.branch = this.branches.includes("main") ? "main" : "master";
+    console.log(`[Gitlab Loader]: Branch auto-assigned to ${this.branch}.`);
+    return;
+  }
+
+  // Pre-flight check of the PAT against `/api/v4/user`. The token is discarded when
+  // the request fails or GitLab answers with a non-2xx status.
+  async #validateAccessToken() {
+    if (!this.accessToken) return;
+    try {
+      const headers = { "PRIVATE-TOKEN": this.accessToken };
+      const res = await fetch(`${this.apiBase}/api/v4/user`, { method: "GET", headers });
+      // fetch only rejects on network errors, so a 401/403 reply must be raised explicitly.
+      if (!res.ok) throw new Error(`GitLab responded with status ${res.status}`);
+    } catch (e) {
+      const msg = "Invalid Gitlab Access Token provided! Access token will not be used";
+      console.error(msg, e.message);
+      this.accessToken = null;
+    }
+  }
+
+  /**
+   * Initializes the RepoLoader instance.
+   * @returns {Promise<GitLabRepoLoader|undefined>} The initialized loader, or undefined when the repo URL is invalid.
+   */
+  async init() {
+    if (!this.#validGitlabUrl()) return;
+    await this.#validBranch();
+    await this.#validateAccessToken();
+    this.ready = true;
+    return this;
+  }
+
+  /**
+   * Recursively loads the repository content.
+   * @returns {Promise<Array<Object>>} An array of loaded documents.
+   * @throws {Error} If the RepoLoader is not in a ready state.
+   */
+  async recursiveLoader() {
+    if (!this.ready) throw new Error("[Gitlab Loader]: not in ready state!");
+
+    if (this.accessToken)
+      console.log(
+        `[Gitlab Loader]: Access token set! Recursive loading enabled!`
+      );
+
+    const files = await this.fetchFilesRecursive();
+    const docs = [];
+
+    for (const file of files) {
+      if (this.ignorePaths.some((path) => file.path.includes(path))) continue;
+
+      const content = await this.fetchSingleFileContents(file.path);
+      if (content) {
+        docs.push({
+          pageContent: content,
+          metadata: { source: file.path },
+        });
+      }
+    }
+
+    return docs;
+  }
+
+  #branchPrefSort(branches = []) {
+    const preferredSort = ["main", "master"];
+    return branches.reduce((acc, branch) => {
+      if (preferredSort.includes(branch)) return [branch, ...acc];
+      return [...acc, branch];
+    }, []);
+  }
+
+  /**
+   * Retrieves the repository's branches and caches their names on `this.branches`.
+   * Only the first 100 branches are requested; further pages are not followed.
+   * @returns {Promise<string[]>} Branch names with `main`/`master` first, or [] on failure.
+   */
+  async getRepoBranches() {
+    if (!this.#validGitlabUrl() || !this.projectId) return [];
+    await this.#validateAccessToken();
+
+    try {
+      this.branches = await fetch(
+        // GitLab pages this endpoint at 20 items by default; request the 100 item maximum.
+        `${this.apiBase}/api/v4/projects/${this.projectId}/repository/branches?per_page=100`,
+        {
+          method: "GET",
+          headers: {
+            Accept: "application/json", // standard header name (`Accepts` is not a real header)
+            ...(this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {}),
+          },
+        }
+      )
+        .then((res) => res.json())
+        .then((branches) => branches.map((b) => b.name))
+        .catch((e) => {
+          console.error(e);
+          return [];
+        });
+
+      return this.#branchPrefSort(this.branches);
+    } catch (err) {
+      console.log(`RepoLoader.getRepoBranches`, err);
+      this.branches = [];
+      return [];
+    }
+  }
+
+  /**
+   * Returns list of all file objects from tree API for GitLab, with ignored paths removed.
+   * @returns {Promise<FileTreeObject[]>} the blobs gathered until the last page or first error.
+   */
+  async fetchFilesRecursive() {
+    const files = [];
+    const perPage = 100;
+    let fetching = true;
+    let page = 1;
+
+    while (fetching) {
+      try {
+        const params = new URLSearchParams({
+          ref: this.branch,
+          recursive: true,
+          per_page: perPage,
+          page,
+        });
+        const queryUrl = `${this.apiBase}/api/v4/projects/${
+          this.projectId
+        }/repository/tree?${params.toString()}`;
+        const response = await fetch(queryUrl, {
+          method: "GET",
+          headers: this.accessToken
+            ? { "PRIVATE-TOKEN": this.accessToken }
+            : {},
+        });
+        const totalPages = Number(response.headers.get("x-total-pages"));
+        const nextPage = Number(response.headers.get("x-next-page"));
+        const data = await response.json();
+
+        // Stop on an error payload or an empty page. A page holding only directories
+        // must not end pagination, so the raw entries are checked rather than the blobs.
+        if (!Array.isArray(data) || data.length === 0) {
+          fetching = false;
+          break;
+        }
+
+        /** @type {FileTreeObject[]} */
+        const objects = data.filter((item) => item.type === "blob"); // only get files, not paths or submodules
+        console.log(
+          `Found ${objects.length} blobs from repo from pg ${page}/${totalPages}`
+        );
+        // Apply ignore path rules to found objects. If any rules match it is an invalid file path.
+        for (const file of objects) {
+          const isIgnored = this.ignorePaths.some((ignorePattern) =>
+            minimatch(file.path, ignorePattern, { matchBase: true })
+          );
+          if (!isIgnored) files.push(file);
+        }
+        // `x-next-page` is empty on the last page and `x-total-pages` can be missing,
+        // so only continue while the next page number actually advances.
+        if (page === totalPages || !(nextPage > page)) {
+          fetching = false;
+          break;
+        }
+        page = nextPage;
+      } catch (e) {
+        console.error(`RepoLoader.getRepositoryTree`, e);
+        fetching = false;
+        break;
+      }
+    }
+    return files;
+  }
+
+  /**
+   * Fetches the content of a single file from the repository.
+   * @param {string} sourceFilePath - The path to the file in the repository.
+   * @returns {Promise<string|null>} The content of the file, or null if fetching fails.
+   */
+  async fetchSingleFileContents(sourceFilePath) {
+    try {
+      const data = await fetch(
+        `${this.apiBase}/api/v4/projects/${
+          this.projectId
+        }/repository/files/${encodeURIComponent(sourceFilePath)}/raw?ref=${
+          this.branch
+        }`,
+        {
+          method: "GET",
+          headers: this.accessToken
+            ? { "PRIVATE-TOKEN": this.accessToken }
+            : {},
+        }
+      ).then((res) => {
+        if (res.ok) return res.text();
+        throw new Error(`Failed to fetch single file ${sourceFilePath}`);
+      });
+
+      return data;
+    } catch (e) {
+      console.error(`RepoLoader.fetchSingleFileContents`, e);
+      return null;
+    }
+  }
+}
+
+module.exports = GitLabRepoLoader;
diff --git a/collector/utils/extensions/RepoLoader/GitlabRepo/index.js b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
new file mode 100644
index 000000000..e756463c7
--- /dev/null
+++ b/collector/utils/extensions/RepoLoader/GitlabRepo/index.js
@@ -0,0 +1,145 @@
+const RepoLoader = require("./RepoLoader");
+const fs = require("fs");
+const path = require("path");
+const { default: slugify } = require("slugify");
+const { v4 } = require("uuid");
+const { writeToServerDocuments } = require("../../../files");
+const { tokenizeString } = require("../../../tokenizer");
+
+/**
+ * Load in a Gitlab Repo recursively (the file tree is always walked recursively, with or without a PAT)
+ * @param {object} args - forwarded request body params
+ * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
+ * @returns
+ */
+async function loadGitlabRepo(args, response) {
+  const repo = new RepoLoader(args);
+  await repo.init();
+
+  if (!repo.ready)
+    return {
+      success: false,
+      reason: "Could not prepare Gitlab repo for loading! Check URL",
+    };
+
+  console.log(
+    `-- Working GitLab ${repo.author}/${repo.project}:${repo.branch} --`
+  );
+  const docs = await repo.recursiveLoader();
+  if (!docs.length) {
+    return {
+      success: false,
+      reason: "No files were found for those settings.",
+    };
+  }
+
+  console.log(`[GitLab Loader]: Found ${docs.length} source files. Saving...`);
+  const outFolder = slugify(
+    `${repo.author}-${repo.project}-${repo.branch}-${v4().slice(0, 4)}`
+  ).toLowerCase();
+
+  const outFolderPath =
+    process.env.NODE_ENV === "development"
+      ? path.resolve(
+          __dirname,
+          `../../../../../server/storage/documents/${outFolder}`
+        )
+      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
+
+  if (!fs.existsSync(outFolderPath))
+    fs.mkdirSync(outFolderPath, { recursive: true });
+
+  for (const doc of docs) {
+    if (!doc.pageContent) continue;
+    const data = {
+      id: v4(),
+      url: "gitlab://" + doc.metadata.source,
+      title: doc.metadata.source,
+      docAuthor: repo.author,
+      description: "No description found.",
+      docSource: doc.metadata.source,
+      chunkSource: generateChunkSource(
+        repo,
+        doc,
+        response.locals.encryptionWorker
+      ),
+      published: new Date().toLocaleString(),
+      wordCount: doc.pageContent.split(" ").length,
+      pageContent: doc.pageContent,
+      token_count_estimate: tokenizeString(doc.pageContent).length,
+    };
+    console.log(
+      `[GitLab Loader]: Saving ${doc.metadata.source} to ${outFolder}`
+    );
+    writeToServerDocuments(
+      data,
+      `${slugify(doc.metadata.source)}-${data.id}`,
+      outFolderPath
+    );
+  }
+
+  return {
+    success: true,
+    reason: null,
+    data: {
+      author: repo.author,
+      repo: repo.project,
+      projectId: repo.projectId,
+      branch: repo.branch,
+      files: docs.length,
+      destination: outFolder,
+    },
+  };
+}
+
+/**
+ * Fetch the raw contents of a single file from a GitLab repository.
+ * @param {{repoUrl: string, branch: string, accessToken?: string|null, sourceFilePath: string}} args
+ * @returns {Promise<{success: boolean, reason: string|null, content: string|null}>}
+ */
+async function fetchGitlabFile({
+  repoUrl,
+  branch,
+  accessToken = null,
+  sourceFilePath,
+}) {
+  const repo = new RepoLoader({ repo: repoUrl, branch, accessToken });
+  await repo.init();
+
+  if (!repo.ready)
+    return {
+      success: false,
+      content: null,
+      reason: "Could not prepare GitLab repo for loading! Check URL or PAT.",
+    };
+  console.log(
+    `-- Working GitLab ${repo.author}/${repo.project}:${repo.branch} file:${sourceFilePath} --`
+  );
+  const fileContent = await repo.fetchSingleFileContents(sourceFilePath);
+  if (!fileContent) {
+    return {
+      success: false,
+      reason: "Target file returned a null content response.",
+      content: null,
+    };
+  }
+  return {
+    success: true,
+    reason: null,
+    content: fileContent,
+  };
+}
+
+function generateChunkSource(repo, doc, encryptionWorker) {
+  const payload = {
+    projectId: decodeURIComponent(repo.projectId),
+    branch: repo.branch,
+    path: doc.metadata.source,
+    pat: !!repo.accessToken ? repo.accessToken : null,
+  };
+  return `gitlab://${repo.repo}?payload=${encryptionWorker.encrypt(
+    JSON.stringify(payload)
+  )}`;
+}
+
+module.exports = { loadGitlabRepo, fetchGitlabFile };
diff --git a/collector/utils/extensions/RepoLoader/index.js b/collector/utils/extensions/RepoLoader/index.js
new file mode 100644
index 000000000..6395e889e
--- /dev/null
+++ b/collector/utils/extensions/RepoLoader/index.js
@@ -0,0 +1,41 @@
+/**
+ * Dynamically load the correct repository loader from a specific platform
+ * by default will return Github.
+ * @param {('github'|'gitlab')} platform
+ * @returns {import("./GithubRepo/RepoLoader")|import("./GitlabRepo/RepoLoader")} the repo loader class for provider
+ */
+function resolveRepoLoader(platform = "github") {
+  switch (platform) {
+    case "github":
+      console.log(`Loading GitHub RepoLoader...`);
+      return require("./GithubRepo/RepoLoader");
+    case "gitlab":
+      console.log(`Loading GitLab RepoLoader...`);
+      return require("./GitlabRepo/RepoLoader");
+    default:
+      console.log(`Loading GitHub RepoLoader...`);
+      return require("./GithubRepo/RepoLoader");
+  }
+}
+
+/**
+ * Dynamically load the correct repository loader function from a specific platform
+ * by default will return Github.
+ * @param {('github'|'gitlab')} platform
+ * @returns {import("./GithubRepo")['loadGithubRepo'] | import("./GitlabRepo")['loadGitlabRepo']} the repo loader function for provider
+ */
+function resolveRepoLoaderFunction(platform = "github") {
+  switch (platform) {
+    case "github":
+      console.log(`Loading GitHub loader function...`);
+      return require("./GithubRepo").loadGithubRepo;
+    case "gitlab":
+      console.log(`Loading GitLab loader function...`);
+      return require("./GitlabRepo").loadGitlabRepo;
+    default:
+      console.log(`Loading GitHub loader function...`);
+      return require("./GithubRepo").loadGithubRepo;
+  }
+}
+
+module.exports = { resolveRepoLoader, resolveRepoLoaderFunction };
diff --git a/collector/yarn.lock b/collector/yarn.lock
index 24dfd435f..68d0181a6 100644
--- a/collector/yarn.lock
+++ b/collector/yarn.lock
@@ -581,6 +581,13 @@ brace-expansion@^1.1.7:
     balanced-match "^1.0.0"
     concat-map "0.0.1"
 
+brace-expansion@^2.0.1:
+  version "2.0.1"
+  resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-2.0.1.tgz#1edc459e0f0c548486ecf9fc99f2221364b9a0ae"
+  integrity sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==
+  dependencies:
+    balanced-match "^1.0.0"
+
 braces@~3.0.2:
   version "3.0.2"
   resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107"
@@ -2226,6 +2233,13 @@ mimic-response@^3.1.0:
   resolved "https://registry.yarnpkg.com/mimic-response/-/mimic-response-3.1.0.tgz#2d1d59af9c1b129815accc2c46a022a5ce1fa3c9"
   integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==
 
+minimatch@5.1.0:
+  version "5.1.0"
+  resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-5.1.0.tgz#1717b464f4971b144f6aabe8f2d0b8e4511e09c7"
+  integrity sha512-9TPBGGak4nHfGZsPBohm9AWg6NoT7QTCehS3BIJABslyZbzxfV78QM2Y6+i741OPZIafFAaiiEMh5OyIrJPgtg==
+  dependencies:
+    brace-expansion "^2.0.1"
+
 minimatch@^3.1.1, minimatch@^3.1.2:
   version "3.1.2"
   resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b"
diff --git a/frontend/src/components/DataConnectorOption/media/gitlab.svg b/frontend/src/components/DataConnectorOption/media/gitlab.svg
new file mode 100644
index 000000000..0d48a00cb
--- /dev/null
+++ b/frontend/src/components/DataConnectorOption/media/gitlab.svg
@@ -0,0 +1,7 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 380 380">
+  <rect width="380" height="380" fill="#FFFFFF"/>
+  <path fill="#e24329" d="M282.83,170.73l-.27-.69-26.14-68.22a6.81,6.81,0,0,0-2.69-3.24,7,7,0,0,0-8,.43,7,7,0,0,0-2.32,3.52l-17.65,54H154.29l-17.65-54A6.86,6.86,0,0,0,134.32,99a7,7,0,0,0-8-.43,6.87,6.87,0,0,0-2.69,3.24L97.44,170l-.26.69a48.54,48.54,0,0,0,16.1,56.1l.09.07.24.17,39.82,29.82,19.7,14.91,12,9.06a8.07,8.07,0,0,0,9.76,0l12-9.06,19.7-14.91,40.06-30,.1-.08A48.56,48.56,0,0,0,282.83,170.73Z"/>
+  <path fill="#fc6d26" d="M282.83,170.73l-.27-.69a88.3,88.3,0,0,0-35.15,15.8L190,229.25c19.55,14.79,36.57,27.64,36.57,27.64l40.06-30,.1-.08A48.56,48.56,0,0,0,282.83,170.73Z"/>
+  <path fill="#fca326" d="M153.43,256.89l19.7,14.91,12,9.06a8.07,8.07,0,0,0,9.76,0l12-9.06,19.7-14.91S209.55,244,190,229.25C170.45,244,153.43,256.89,153.43,256.89Z"/>
+  <path fill="#fc6d26" d="M132.58,185.84A88.19,88.19,0,0,0,97.44,170l-.26.69a48.54,48.54,0,0,0,16.1,56.1l.09.07.24.17,39.82,29.82s17-12.85,36.57-27.64Z"/>
+</svg>
diff --git a/frontend/src/components/DataConnectorOption/media/index.js b/frontend/src/components/DataConnectorOption/media/index.js
index dee46a12b..cbc80b642 100644
--- a/frontend/src/components/DataConnectorOption/media/index.js
+++ b/frontend/src/components/DataConnectorOption/media/index.js
@@ -1,10 +1,12 @@
 import Github from "./github.svg";
+import Gitlab from "./gitlab.svg";
 import YouTube from "./youtube.svg";
 import Link from "./link.svg";
 import Confluence from "./confluence.jpeg";
 
 const ConnectorImages = {
   github: Github,
+  gitlab: Gitlab,
   youtube: YouTube,
   websiteDepth: Link,
   confluence: Confluence,
diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
new file mode 100644
index 000000000..f3c34dc8a
--- /dev/null
+++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/Gitlab/index.jsx
@@ -0,0 +1,310 @@
+import React, { useEffect, useState } from "react";
+import System from "@/models/system";
+import showToast from "@/utils/toast";
+import pluralize from "pluralize";
+import { TagsInput } from "react-tag-input-component";
+import { Info, Warning } from "@phosphor-icons/react";
+import { Tooltip } from "react-tooltip";
+
+const DEFAULT_BRANCHES = ["main", "master"];
+// GitLab connector form: collects a repo via System.dataConnectors.gitlab and
+// reports the outcome with toasts. Branch options refresh when a field blurs.
+export default function GitlabOptions() {
+  const [loading, setLoading] = useState(false);
+  const [repo, setRepo] = useState(null);
+  const [accessToken, setAccessToken] = useState(null);
+  const [ignores, setIgnores] = useState([]);
+  const [settings, setSettings] = useState({ repo: null, accessToken: null });
+
+  const handleSubmit = async (e) => {
+    e.preventDefault();
+    const form = new FormData(e.target);
+
+    try {
+      setLoading(true);
+      showToast(
+        `Fetching all files for repo ${repo} - this may take a while.`,
+        "info",
+        { clear: true, autoClose: false }
+      );
+
+      const { data, error } = await System.dataConnectors.gitlab.collect({
+        repo: form.get("repo"),
+        accessToken: form.get("accessToken"),
+        branch: form.get("branch"),
+        ignorePaths: ignores,
+      });
+
+      if (error) {
+        showToast(error, "error", { clear: true });
+        return;
+      }
+
+      showToast(
+        `${data.files} ${pluralize("file", data.files)} collected from ${
+          data.author
+        }/${data.repo}:${data.branch}. Output folder is ${data.destination}.`,
+        "success",
+        { clear: true }
+      );
+      // reset() only clears DOM inputs; clear the state mirrored from them too.
+      e.target.reset();
+      setRepo(null);
+      setAccessToken(null);
+      setSettings({ repo: null, accessToken: null });
+    } catch (err) {
+      console.error(err);
+      showToast(err.message, "error", { clear: true });
+    } finally {
+      setLoading(false);
+    }
+  };
+
+  return (
+    <div className="flex w-full">
+      <div className="flex flex-col w-full px-1 md:pb-6 pb-16">
+        <form className="w-full" onSubmit={handleSubmit}>
+          <div className="w-full flex flex-col py-2">
+            <div className="w-full flex flex-col gap-4">
+              <div className="flex flex-col pr-10">
+                <div className="flex flex-col gap-y-1 mb-4">
+                  <label className="text-white text-sm font-bold">
+                    GitLab Repo URL
+                  </label>
+                  <p className="text-xs font-normal text-white/50">
+                    URL of the GitLab repo you wish to collect.
+                  </p>
+                </div>
+                <input
+                  type="url"
+                  name="repo"
+                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                  placeholder="https://gitlab.com/gitlab-org/gitlab"
+                  required={true}
+                  autoComplete="off"
+                  onChange={(e) => setRepo(e.target.value)}
+                  onBlur={() => setSettings({ ...settings, repo })}
+                  spellCheck={false}
+                />
+              </div>
+              <div className="flex flex-col pr-10">
+                <div className="flex flex-col gap-y-1 mb-4">
+                  <label className="text-white font-bold text-sm flex gap-x-2 items-center">
+                    <p className="font-bold text-white">GitLab Access Token</p>{" "}
+                    <p className="text-xs text-white/50 font-light flex items-center">
+                      optional
+                      <PATTooltip accessToken={accessToken} />
+                    </p>
+                  </label>
+                  <p className="text-xs font-normal text-white/50">
+                    Access Token to prevent rate limiting.
+                  </p>
+                </div>
+                <input
+                  type="text"
+                  name="accessToken"
+                  className="border-none bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white block w-full p-2.5"
+                  placeholder="glpat-XXXXXXXXXXXXXXXXXXXX"
+                  required={false}
+                  autoComplete="off"
+                  spellCheck={false}
+                  onChange={(e) => setAccessToken(e.target.value)}
+                  onBlur={() => setSettings({ ...settings, accessToken })}
+                />
+              </div>
+              <GitLabBranchSelection
+                repo={settings.repo}
+                accessToken={settings.accessToken}
+              />
+            </div>
+            {/* Patterns entered here are sent to collect() as ignorePaths. */}
+            <div className="flex flex-col w-full py-4 pr-10">
+              <div className="flex flex-col gap-y-1 mb-4">
+                <label className="text-white text-sm flex gap-x-2 items-center">
+                  <p className="text-white text-sm font-bold">File Ignores</p>
+                </label>
+                <p className="text-xs font-normal text-white/50">
+                  List in .gitignore format to ignore specific files during
+                  collection. Press enter after each entry you want to save.
+                </p>
+              </div>
+              <TagsInput
+                value={ignores}
+                onChange={setIgnores}
+                name="ignores"
+                placeholder="!*.js, images/*, .DS_Store, bin/*"
+                classNames={{
+                  tag: "bg-blue-300/10 text-zinc-800",
+                  input:
+                    "flex bg-zinc-900 text-white placeholder:text-white/20 text-sm rounded-lg focus:border-white",
+                }}
+              />
+            </div>
+          </div>
+          {/* Missing-token notice, submit button, and in-progress hint. */}
+          <div className="flex flex-col gap-y-2 w-full pr-10">
+            <PATAlert accessToken={accessToken} />
+            <button
+              type="submit"
+              disabled={loading}
+              className="mt-2 w-full justify-center border border-slate-200 px-4 py-2 rounded-lg text-dark-text text-sm font-bold items-center flex gap-x-2 bg-slate-200 hover:bg-slate-300 hover:text-slate-800 disabled:bg-slate-300 disabled:cursor-not-allowed"
+            >
+              {loading ? "Collecting files..." : "Submit"}
+            </button>
+            {loading && (
+              <p className="text-xs text-white/50">
+                Once complete, all files will be available for embedding into
+                workspaces in the document picker.
+              </p>
+            )}
+          </div>
+        </form>
+      </div>
+    </div>
+  );
+}
+
+/**
+ * Branch <select> for the GitLab connector form.
+ *
+ * Re-fetches the branch list via System.dataConnectors.gitlab.branches
+ * whenever `repo` or `accessToken` changes and falls back to
+ * DEFAULT_BRANCHES when no repo is set or no branches come back.
+ * @param {Object} props
+ * @param {string|null} props.repo - Repo URL to list branches for.
+ * @param {string|null} props.accessToken - Optional token for the lookup.
+ */
+function GitLabBranchSelection({ repo, accessToken }) {
+  const [allBranches, setAllBranches] = useState(DEFAULT_BRANCHES);
+  const [loading, setLoading] = useState(true);
+
+  useEffect(() => {
+    // Flipped by the cleanup below so a slow response for a previous
+    // repo/token (or one arriving after unmount) cannot overwrite newer state.
+    let stale = false;
+    async function fetchAllBranches() {
+      if (!repo) {
+        setAllBranches(DEFAULT_BRANCHES);
+        setLoading(false);
+        return;
+      }
+
+      setLoading(true);
+      const { branches } = await System.dataConnectors.gitlab.branches({
+        repo,
+        accessToken,
+      });
+      if (stale) return;
+      setAllBranches(branches.length > 0 ? branches : DEFAULT_BRANCHES);
+      setLoading(false);
+    }
+    fetchAllBranches();
+    return () => {
+      stale = true;
+    };
+  }, [repo, accessToken]);
+
+  // Loading and loaded states share this markup; only the options differ.
+  return (
+    <div className="flex flex-col w-60">
+      <div className="flex flex-col gap-y-1 mb-4">
+        <label className="text-white text-sm font-bold">Branch</label>
+        <p className="text-xs font-normal text-white/50">
+          Branch you wish to collect files from.
+        </p>
+      </div>
+      <select
+        name="branch"
+        required={true}
+        className="bg-zinc-900 border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+      >
+        {loading ? (
+          <option disabled={true} selected={true}>
+            -- loading available branches --
+          </option>
+        ) : (
+          allBranches.map((branch) => (
+            <option key={branch} value={branch}>
+              {branch}
+            </option>
+          ))
+        )}
+      </select>
+    </div>
+  );
+}
+
+// Info banner explaining the limits of collecting without an access token.
+// Hidden (renders null) as soon as `accessToken` holds a truthy value.
+function PATAlert({ accessToken }) {
+  const tokenUrl = "https://gitlab.com/-/profile/personal_access_tokens";
+  const stopBubbling = (event) => event.stopPropagation();
+  return accessToken ? null : (
+    <div className="flex flex-col md:flex-row md:items-center gap-x-2 text-white mb-4 bg-blue-800/30 w-fit rounded-lg px-4 py-2">
+      <div className="gap-x-2 flex items-center">
+        <Info className="shrink-0" size={25} />
+        <p className="text-sm">
+          Without filling out the <b>GitLab Access Token</b> this data connector
+          will only be able to collect the <b>top-level</b> files of the repo
+          due to GitLab's public API rate-limits.
+          <br />
+          <br />
+          <a
+            href={tokenUrl} rel="noreferrer" target="_blank"
+            className="underline" onClick={stopBubbling}
+          >
+            {" "}
+            Get a free Personal Access Token with a GitLab account here.
+          </a>
+        </p>
+      </div>
+    </div>
+  );
+}
+
+// Warning icon + tooltip shown beside the access token label while no token
+// is entered; renders nothing once `accessToken` is truthy.
+function PATTooltip({ accessToken }) {
+  if (accessToken) return null;
+  return (
+    <>
+      <Warning
+        size={14}
+        className="ml-1 text-orange-500 cursor-pointer"
+        data-tooltip-id="access-token-tooltip"
+        data-tooltip-place="right"
+      />
+      <Tooltip
+        delayHide={300}
+        id="access-token-tooltip"
+        className="max-w-xs"
+        clickable={true}
+      >
+        <p className="text-sm">
+          Without a{" "}
+          <a
+            href="https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html"
+            rel="noreferrer"
+            target="_blank"
+            className="underline"
+            onClick={(e) => e.stopPropagation()}
+          >
+            Personal Access Token
+          </a>
+          , the GitLab API may limit the number of files that can be collected
+          due to rate limits. You can{" "}
+          <a
+            href="https://gitlab.com/-/profile/personal_access_tokens"
+            rel="noreferrer"
+            target="_blank"
+            className="underline"
+            onClick={(e) => e.stopPropagation()}
+          >
+            create a temporary Access Token
+          </a>{" "}
+          to avoid this issue.
+        </p>
+      </Tooltip>
+    </>
+  );
+}
diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx
index c2c14dff3..9df6dd7d8 100644
--- a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx
+++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx
@@ -1,6 +1,7 @@
 import ConnectorImages from "@/components/DataConnectorOption/media";
 import { MagnifyingGlass } from "@phosphor-icons/react";
 import GithubOptions from "./Connectors/Github";
+import GitlabOptions from "./Connectors/Gitlab";
 import YoutubeOptions from "./Connectors/Youtube";
 import ConfluenceOptions from "./Connectors/Confluence";
 import { useState } from "react";
@@ -15,6 +16,13 @@ export const DATA_CONNECTORS = {
       "Import an entire public or private Github repository in a single click.",
     options: <GithubOptions />,
   },
+  gitlab: {
+    name: "GitLab Repo",
+    image: ConnectorImages.gitlab,
+    description:
+      "Import an entire public or private GitLab repository in a single click.",
+    options: <GitlabOptions />,
+  },
   "youtube-transcript": {
     name: "YouTube Transcript",
     image: ConnectorImages.youtube,
diff --git a/frontend/src/models/dataConnector.js b/frontend/src/models/dataConnector.js
index d01c3c8b8..c363835c8 100644
--- a/frontend/src/models/dataConnector.js
+++ b/frontend/src/models/dataConnector.js
@@ -42,6 +42,45 @@ const DataConnector = {
         });
     },
   },
+  gitlab: {
+    // Client calls for the server's /ext/gitlab/* extension endpoints.
+    // Lists a repo's branches and resolves to { branches, error }. Response
+    // failures are logged, toasted, and reported with an empty branch list.
+    branches: async ({ repo, accessToken }) => {
+      const request = fetch(`${API_BASE}/ext/gitlab/branches`, {
+        method: "POST",
+        headers: baseHeaders(),
+        cache: "force-cache",
+        body: JSON.stringify({ repo, accessToken }),
+      });
+      try {
+        const payload = await (await request).json();
+        if (!payload.success) throw new Error(payload.reason);
+        return { branches: payload.data?.branches || [], error: null };
+      } catch (e) {
+        console.error(e);
+        showToast(e.message, "error");
+        return { branches: [], error: e.message };
+      }
+    },
+    // Collects a repo branch and resolves to { data, error }. Response
+    // failures are logged and handed back as the error message (no toast).
+    collect: async function ({ repo, accessToken, branch, ignorePaths = [] }) {
+      const request = fetch(`${API_BASE}/ext/gitlab/repo`, {
+        method: "POST",
+        headers: baseHeaders(),
+        body: JSON.stringify({ repo, accessToken, branch, ignorePaths }),
+      });
+      try {
+        const payload = await (await request).json();
+        if (!payload.success) throw new Error(payload.reason);
+        return { data: payload.data, error: null };
+      } catch (e) {
+        console.error(e);
+        return { data: null, error: e.message };
+      }
+    },
+  },
   youtube: {
     transcribe: async ({ url }) => {
       return await fetch(`${API_BASE}/ext/youtube/transcript`, {
diff --git a/server/endpoints/extensions/index.js b/server/endpoints/extensions/index.js
index cf8e1191c..8f836ce07 100644
--- a/server/endpoints/extensions/index.js
+++ b/server/endpoints/extensions/index.js
@@ -5,18 +5,26 @@ const {
   ROLES,
 } = require("../../utils/middleware/multiUserProtected");
 const { validatedRequest } = require("../../utils/middleware/validatedRequest");
+const {
+  isSupportedRepoProvider,
+} = require("../../utils/middleware/isSupportedRepoProviders");
 
 function extensionEndpoints(app) {
   if (!app) return;
 
   app.post(
-    "/ext/github/branches",
-    [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
+    "/ext/:repo_platform/branches",
+    [
+      validatedRequest,
+      flexUserRoleValid([ROLES.admin, ROLES.manager]),
+      isSupportedRepoProvider,
+    ],
     async (request, response) => {
       try {
+        const { repo_platform } = request.params;
         const responseFromProcessor =
           await new CollectorApi().forwardExtensionRequest({
-            endpoint: "/ext/github-repo/branches",
+            endpoint: `/ext/${repo_platform}-repo/branches`,
             method: "POST",
             body: request.body,
           });
@@ -29,18 +37,23 @@ function extensionEndpoints(app) {
   );
 
   app.post(
-    "/ext/github/repo",
-    [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])],
+    "/ext/:repo_platform/repo",
+    [
+      validatedRequest,
+      flexUserRoleValid([ROLES.admin, ROLES.manager]),
+      isSupportedRepoProvider,
+    ],
     async (request, response) => {
       try {
+        const { repo_platform } = request.params;
         const responseFromProcessor =
           await new CollectorApi().forwardExtensionRequest({
-            endpoint: "/ext/github-repo",
+            endpoint: `/ext/${repo_platform}-repo`,
             method: "POST",
             body: request.body,
           });
         await Telemetry.sendTelemetry("extension_invoked", {
-          type: "github_repo",
+          type: `${repo_platform}_repo`,
         });
         response.status(200).json(responseFromProcessor);
       } catch (e) {
diff --git a/server/utils/middleware/isSupportedRepoProviders.js b/server/utils/middleware/isSupportedRepoProviders.js
new file mode 100644
index 000000000..6a5cfb169
--- /dev/null
+++ b/server/utils/middleware/isSupportedRepoProviders.js
@@ -0,0 +1,12 @@
+// Repo platforms accepted in the `:repo_platform` route param.
+const REPO_PLATFORMS = ["github", "gitlab"];
+
+// Express middleware: calls next() when request.params.repo_platform is a
+// supported platform, otherwise replies 500 with a plain-text message.
+// Uses send() since Express responses have no text() method (was a TypeError).
+function isSupportedRepoProvider(request, response, next) {
+  const { repo_platform = null } = request.params;
+  if (REPO_PLATFORMS.includes(repo_platform)) return next();
+  response.status(500).send(`Unsupported repo platform ${repo_platform}`);
+}
+module.exports = { isSupportedRepoProvider };
-- 
GitLab