From 78e3e35d270fcc1c3c0c1ee98092c37d2fcda623 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Predrag=20Stojadinovi=C4=87?=
 <cope@users.noreply.github.com>
Date: Tue, 14 May 2024 19:21:04 +0200
Subject: [PATCH] [FEAT] Confluence Data Connector handles custom Confluence
 urls (#1362)

* chore: Confluence data connector can now handle custom URLs, in addition to the default {subdomain}.atlassian.net ones

* chore: formatting as per yarn lint
---
 .../utils/extensions/Confluence/index.js      | 42 ++++++++++++++-----
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js
index 5a473f654..5bb44deeb 100644
--- a/collector/utils/extensions/Confluence/index.js
+++ b/collector/utils/extensions/Confluence/index.js
@@ -2,6 +2,7 @@ const fs = require("fs");
 const path = require("path");
 const { default: slugify } = require("slugify");
 const { v4 } = require("uuid");
+const UrlPattern = require("url-pattern");
 const { writeToServerDocuments } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const {
@@ -9,13 +10,29 @@ const {
 } = require("langchain/document_loaders/web/confluence");
 
 function validSpaceUrl(spaceUrl = "") {
-  const UrlPattern = require("url-pattern");
-  const pattern = new UrlPattern(
-    "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)*"
+  // Try the Atlassian-hosted form first: https://{subdomain}.atlassian.net/wiki/spaces/{spaceKey}/...
+  const atlassianPattern = new UrlPattern(
+    "https\\://(:subdomain).atlassian.net/wiki/spaces/(:spaceKey)/*"
   );
-  const match = pattern.match(spaceUrl);
-  if (!match) return { valid: false, result: null };
-  return { valid: true, result: match };
+  const atlassianMatch = atlassianPattern.match(spaceUrl);
+  if (atlassianMatch) {
+    return { valid: true, result: atlassianMatch };
+  }
+
+  // Fallback: custom host shaped [subdomain.]domain.tld (NOTE(review): deeper hosts or ports presumably fail to match - confirm)
+  const customPattern = new UrlPattern(
+    "https\\://(:subdomain.):domain.:tld/wiki/spaces/(:spaceKey)/*"
+  );
+  const customMatch = customPattern.match(spaceUrl);
+  if (customMatch) {
+    customMatch.customDomain =
+      (customMatch.subdomain ? `${customMatch.subdomain}.` : "") + // prefix only when a subdomain was captured
+      `${customMatch.domain}.${customMatch.tld}`;
+    return { valid: true, result: customMatch, custom: true };
+  }
+
+  // Neither pattern matched: report the URL as invalid with no parsed result
+  return { valid: false, result: null };
 }
 
 async function loadConfluence({ pageUrl, username, accessToken }) {
@@ -32,14 +49,19 @@ async function loadConfluence({ pageUrl, username, accessToken }) {
     return {
       success: false,
       reason:
-        "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/*",
+        "Confluence space URL is not in the expected format of https://domain.atlassian.net/wiki/space/~SPACEID/* or https://customDomain/wiki/space/~SPACEID/*",
     };
   }
 
-  const { subdomain, spaceKey } = validSpace.result;
-  console.log(`-- Working Confluence ${subdomain}.atlassian.net --`);
+  const { subdomain, customDomain, spaceKey } = validSpace.result;
+  let baseUrl = `https://${subdomain}.atlassian.net/wiki`;
+  if (customDomain) {
+    baseUrl = `https://${customDomain}/wiki`;
+  }
+
+  console.log(`-- Working Confluence ${baseUrl} --`);
   const loader = new ConfluencePagesLoader({
-    baseUrl: `https://${subdomain}.atlassian.net/wiki`,
+    baseUrl,
     spaceKey,
     username,
     accessToken,
-- 
GitLab